From 27cab47f15b031d8ee75db43ea9de183d25b0ccf Mon Sep 17 00:00:00 2001 From: Robin Kobus <19427718+Funatiq@users.noreply.github.com> Date: Mon, 9 Mar 2026 17:06:30 +0100 Subject: [PATCH 1/9] [https://nvbugs/5924144][test] unwaive cpp/test_unit_tests.py::test_unit_tests[kernels-80] (#11902) Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index b87997f01a2..22225fc32bd 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -329,7 +329,6 @@ unittest/_torch/visual_gen/test_wan.py::TestWanTwoStageTransformer::test_two_sta disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_mpi[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5920761) accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency_default] SKIP (https://nvbugs/5920751) accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[fp8-1-trtllm] SKIP (https://nvbugs/5921674) -cpp/test_unit_tests.py::test_unit_tests[kernels-80] SKIP (https://nvbugs/5924144) full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True] SKIP (https://nvbugs/5929339) accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_eagle3[xgrammar-eagle3_one_model=True] SKIP (https://nvbugs/5879614) accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_eagle3[llguidance-eagle3_one_model=True] SKIP (https://nvbugs/5893116) From 7a68c42a235a5f40b811c29556fe684b2f6a2e73 Mon Sep 17 00:00:00 2001 From: tburt-nv <195370667+tburt-nv@users.noreply.github.com> Date: Mon, 9 Mar 2026 13:03:46 -0400 Subject: [PATCH 2/9] [None][chore] limit tileiras to CUDA13.1 (#12042) Signed-off-by: Tyler Burt 
<195370667+tburt-nv@users.noreply.github.com> --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c41db79b650..df670e33e94 100644 --- a/requirements.txt +++ b/requirements.txt @@ -81,7 +81,7 @@ torchao>=0.14.1,<0.16.0 cuda-core llist cuda-tile>=1.0.1 -nvidia-cuda-tileiras>=13.1 +nvidia-cuda-tileiras>=13.1,<13.2 etcd-sdk-python==0.0.7 python-multipart smg-grpc-proto>=0.4.2 From ae2dc3d671002d75a9e6163e8cae4e4a7c3752ff Mon Sep 17 00:00:00 2001 From: Lain Date: Mon, 9 Mar 2026 10:06:49 -0700 Subject: [PATCH 3/9] [None][feat] Add silu to trtllm-gen MoE (#11663) Signed-off-by: Siyuan Fu --- .../batchedGemm/KernelRunner.h | 7 +- .../BatchedGemmInterface.h | 18 +- .../trtllmGen_bmm_export/BatchedGemmOptions.h | 62 +- .../batchedGemm/trtllmGen_bmm_export/Enums.h | 4 + .../trtllmGen_bmm_export/GemmOptions.h | 147 +- .../trtllmGen_bmm_export/KernelMetaInfo.h | 16674 ++++++++++++---- .../trtllmGen_bmm_export/KernelParams.h | 35 +- .../trtllmGen_bmm_export/KernelParamsDecl.h | 2 +- .../trtllmGen_bmm_export/KernelTraits.h | 413 +- .../trtllmGen_bmm_export/TmaDescriptor.h | 1 + .../trtllmGen_bmm_export/config.json | 18 +- ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - 
..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...p_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...p_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...chedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp | 3 + ...chedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + 
..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...chedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp | 3 + ...chedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 - 
..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 - 
..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp | 3 - ...x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - 
...x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + 
...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...sFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp | 3 + ...sFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp | 3 - ...x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...sFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp | 3 + ...sFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp | 3 - ...x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...sOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp | 3 + ...chedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp | 3 + ...sOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp | 3 - ...chedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp | 3 - ...sOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp | 3 + ...chedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp | 3 + ...sOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp | 3 - ...chedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + 
...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + 
..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + 
..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - 
..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + 
..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...p_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...p_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + 
..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ..._biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + 
...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + 
...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...ma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp} | 4 +- ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...mp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp | 3 + ...p_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp | 3 + ...Tma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin.cpp | 3 + ...Tma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin.cpp | 3 + ...mp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp | 3 - ...p_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp | 3 - ...Tma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin.cpp | 3 - ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + 
...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - 
...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - 
...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...sSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ..._ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...chedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp | 3 + 
...chedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp | 3 - ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...chedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp | 3 + ...chedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp | 3 - ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + 
...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...aSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 - 
...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 + 
...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp | 3 - ...Tma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp | 3 + ...Tma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp | 3 - ...Tma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp | 3 + ...Tma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp | 3 - 
...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Tma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp | 3 + ...Tma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp | 3 - ...Tma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp | 3 + ...Tma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp | 3 - ...Tma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp | 3 + ...Tma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp | 3 - ...Tma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp | 3 + 
...Tma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp | 3 - 
...Tma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp | 3 + ...Tma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp | 3 - ...Tma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp | 3 + ...aSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp} | 4 +- ...maSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp | 3 + ...Tma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp | 3 - ...sFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp | 3 + ...sFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp | 3 - ...sFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp | 3 + ...sFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp | 3 + ...maSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp | 3 - ...maSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 + 
...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp | 3 - ...sOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp | 3 + ...chedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp | 3 + ...sOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp | 3 - ...chedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp | 3 - ...sOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp | 3 + ...chedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp | 3 + ...sOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp | 3 - ...chedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + 
...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...chedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp | 3 + ...chedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp | 3 - ...chedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp | 3 + ...chedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp | 3 - ...sFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp | 3 + ...sFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp | 3 - ...sFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp | 3 + ...sFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp | 3 - ...sOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp | 3 + ...chedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp | 3 + ...sOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp | 3 - ...chedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp | 3 - ...sOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp | 3 + ...chedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp | 3 + ...sOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp | 3 - ...chedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp | 3 - ...p_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp | 3 + ...p_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp | 3 - ...p_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp | 3 + ...p_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp | 3 - ...p_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp | 3 + ...p_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp | 3 - ...p_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp | 3 + ...p_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp | 3 - 
...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...p_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp | 3 + ...p_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp | 3 - ...p_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp | 3 + ...p_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...p_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp | 3 + ...p_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp | 3 - 
...p_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp | 3 + ...p_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp | 3 - ...p_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp | 3 + ...p_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp | 3 - ...p_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp | 3 + ...p_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 + ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - ...Sf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp | 3 - 
.../trtllm/gen/CudaArchDecl.h | 1 + .../trtllm/gen/CudaKernelLauncher.h | 62 +- .../trtllmGen_bmm_export/trtllm/gen/MmaDecl.h | 9 +- .../trtllm/gen/SfLayoutDecl.h | 8 + .../trtllmGenKernels/blockScaleMoe/runner.cu | 9 +- jenkins/L0_MergeRequest.groovy | 4 +- .../custom_ops/fused_moe/trtllm_moe.py | 14 +- .../modules/fused_moe/fused_moe_trtllm_gen.py | 11 +- .../_torch/modules/fused_moe/quantization.py | 7 +- tensorrt_llm/_torch/utils.py | 1 + tests/unittest/_torch/thop/serial/test_moe.py | 19 +- 1101 files changed, 14908 insertions(+), 5857 deletions(-) create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 
100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp rename cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp => Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp} (81%) create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_silu_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp rename 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp => Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp} (81%) create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete 
mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 
100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 
100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create 
mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h index 24d34154017..21c1b80d985 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h @@ -42,7 +42,8 @@ enum class ActType // // GatedSilu is a special case of SwiGlu where the alpha is 1.0 and the beta is 0.0. SwiGlu, - Relu2 + Relu2, + Silu }; // Type of the element-wise activation to apply after the Gemm @@ -59,6 +60,10 @@ enum class EltwiseActType // act = relu(x0) ^ 2 // where x0 is the output of the Gemm. Relu2, + // Silu is defined as the following operation: + // act = x0 * sigmoid(x0) + // where x0 is the output of the Gemm. + Silu }; struct TrtllmGenBatchedGemmRunnerOptions diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h index a84b863cdc0..0f14135427f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h @@ -141,10 +141,10 @@ struct BatchedGemmData // The rightmost dimension is contiguous in memory. // // If DeepSeek FP8 recipe is not used, but for MxFp{4,8}, MxInt4 and NvFp4 formats: - // The layout of scaling factors for A is always R128c4 + // If the layout is R128c4, // M must be a multiple of 128. - // K must be a multiple of 64. - // The "logical" shape is: [paddedM, K / P], where P is the scaling block size. 
+ // K must be a multiple of 4 * P, where P is the scaling block size. + // The "logical" shape is: [paddedM, K / P]. // The R128c4 layout is: [paddedM / 128, K / P / 4, 512]. // The shape we use for TMA is: [paddedM / 128, K / P / 4, 2, 256]. // Where paddedM is M if (routeAct == true && batchM), or @@ -302,7 +302,7 @@ struct BatchedGemmData // The pre-activation scaling factor (typically dequantA * dequantB) for non-gated non-linear // activation. - // Only used when non-linear activation is applied (e.g., GELU, Relu2). + // Only used when non-linear activation is applied (e.g., GELU, Relu2, Silu). // When used, scaleC should be quantScaleC only, and this scale is applied before the // activation. Shape is [B]. float const* mPtrScaleAct{nullptr}; @@ -786,7 +786,7 @@ class BatchedGemmInterface { numCtasBatch += batchM ? gemm::divUp(options.mBatchedM[bi], options.mTileM * options.mClusterDimX) * options.mClusterDimX - : gemm::divUp(options.mBatchedN[bi], options.mTileN); + : gemm::divUp(options.mBatchedN[bi], options.mTileN * options.mClusterDimY) * options.mClusterDimY; } } // For MoE, mNumTokens != 0 and the number of CTAs is known only at runtime. @@ -923,19 +923,21 @@ class BatchedGemmInterface { totalNumPaddedTokens += batchM ? gemm::divUpMul(options.mBatchedM[bi], options.mTileM * options.mClusterDimX) - : gemm::divUpMul(options.mBatchedN[bi], options.mTileN); + : gemm::divUpMul(options.mBatchedN[bi], options.mTileN * options.mClusterDimY); } } else { // Get tile in token dim. - auto tileTokensDim = batchM ? options.mTileM * options.mClusterDimX : options.mTileN; + auto tileTokensDim + = batchM ? options.mTileM * options.mClusterDimX : options.mTileN * options.mClusterDimY; totalNumPaddedTokens = data.mProblemDimensions.mMaxNumCtasInTokenDim * tileTokensDim; } // Get options from config. auto& options = config.mOptions; - int const tokenTile = batchM ? options.mTileM * options.mClusterDimX : options.mTileN; + int const tokenTile + = batchM ? 
options.mTileM * options.mClusterDimX : options.mTileN * options.mClusterDimY; auto const numTokens = totalNumPaddedTokens; auto const intermediateDim = batchM ? options.mN : options.mM; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h index b78600aebfa..981aae7609e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h @@ -100,18 +100,18 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeMmaA, tg::Dtype dtypeMmaB, gemm::EltwiseActType eltwiseActType, bool enablesEarlyExit, bool enablesDelayedEarlyExit, bool enablesGlobalPtxKnobs, int epilogueLdtmDps, int epilogueLdtmBits, int epilogueTileM, int epilogueTileN, - bool fuseUtccpWithUtcmma, bool gridTriggerSecondaryA, bool gridTriggerSecondaryB, - bool gridWaitForPrimaryEarlyExit, bool gridWaitForPrimaryA, bool gridWaitForPrimaryB, bool hoistLoadTaskInit, - bool hoistMmaTaskTryWaits, int k, gemm::KernelTraits kernelTraits, gemm::MatrixLayout layoutA, - gemm::MatrixLayout layoutB, int m, int mmaK, tg::MmaKind mmaKind, int mmaM, int mmaN, bool mockAllReduce, int n, - int numEpilogueWarps, int numRegsCastAWarps, int numRegsCopySfLdsSttm, int numRegsCopySparsityInfo, - int numRegsPerThreadEpilogueWarp, int numRegsPerThreadNonEpilogueWarp, int numSlicesForSplitK, - int numSlicesForSliceK, int numStages, int numStagesMma, int numStagesMmaWithinWorkTile, - int numStagesMmaAcrossWorkTile, int numStagesWorkId, bool outputDebugTensors, bool patchF2fp, - int32_t sfBlockSizeA, int32_t sfBlockSizeB, int32_t sfBlockSizeC, tg::SfLayout sfLayoutA, - tg::SfLayout sfLayoutB, tg::SfLayout sfLayoutC, int32_t sfReshapeFactor, bool sliceK, 
tg::Sparsity sparsityA, - gemm::SplitK splitK, int tileK, int tileM, int tileN, gemm::TileScheduler tileScheduler, - bool transposeMmaOutput, bool useCustomMmaSchedule, bool useDeepSeekFp8, + int fallbackClusterDimX, int fallbackClusterDimY, int fallbackClusterDimZ, bool fuseUtccpWithUtcmma, + bool gridTriggerSecondaryA, bool gridTriggerSecondaryB, bool gridWaitForPrimaryEarlyExit, + bool gridWaitForPrimaryA, bool gridWaitForPrimaryB, bool hoistLoadTaskInit, bool hoistMmaTaskTryWaits, int k, + gemm::KernelTraits kernelTraits, gemm::MatrixLayout layoutA, gemm::MatrixLayout layoutB, int m, int mmaK, + tg::MmaKind mmaKind, int mmaM, int mmaN, bool mockAllReduce, int n, int numEpilogueWarps, int numRegsCastAWarps, + int numRegsCopySfLdsSttm, int numRegsCopySparsityInfo, int numRegsPerThreadEpilogueWarp, + int numRegsPerThreadNonEpilogueWarp, int numSlicesForSplitK, int numSlicesForSliceK, int numStages, + int numStagesMma, int numStagesMmaWithinWorkTile, int numStagesMmaAcrossWorkTile, int numStagesWorkId, + bool outputDebugTensors, bool patchF2fp, int32_t sfBlockSizeA, int32_t sfBlockSizeB, int32_t sfBlockSizeC, + tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB, tg::SfLayout sfLayoutC, int32_t sfReshapeFactor, bool sliceK, + tg::Sparsity sparsityA, gemm::SplitK splitK, int tileK, int tileM, int tileN, gemm::TileScheduler tileScheduler, + bool transposeMmaOutput, bool useCustomMmaSchedule, bool useDeepSeekFp8, bool useFlexibleClusterDims, bool useHoistTryWaitForCustomMmaSchedule, bool useMaxTmemOverlap, bool usePerTokenSfA, bool usePerTokenSfB, bool useShuffledMatrix, bool useTmaStore, bool useTwoTmaLoadWarps, bool useTwoMmaWarps, bool useUnrollLoop2xForMma, int validM, int validN, int validK, int worldSize, @@ -127,17 +127,18 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions gemm::GemmOptions(allReduceAlgo, biasType, blockK, clcFastDrain, clusterDimX, clusterDimY, clusterDimZ, ctaSwizzleType, dtypeAcc, dtypeA, dtypeB, dtypeC, dtypeMmaA, dtypeMmaB, 
eltwiseActType, enablesEarlyExit, enablesDelayedEarlyExit, enablesGlobalPtxKnobs, epilogueLdtmDps, epilogueLdtmBits, - epilogueTileM, epilogueTileN, fuseUtccpWithUtcmma, gridTriggerSecondaryA, gridTriggerSecondaryB, - gridWaitForPrimaryEarlyExit, gridWaitForPrimaryA, gridWaitForPrimaryB, hoistLoadTaskInit, - hoistMmaTaskTryWaits, k, kernelTraits, layoutA, layoutB, m, mmaK, mmaKind, mmaM, mmaN, mockAllReduce, n, - numEpilogueWarps, numRegsCastAWarps, numRegsCopySfLdsSttm, numRegsCopySparsityInfo, - numRegsPerThreadEpilogueWarp, numRegsPerThreadNonEpilogueWarp, numSlicesForSplitK, numSlicesForSliceK, - numStages, numStagesMma, numStagesMmaWithinWorkTile, numStagesMmaAcrossWorkTile, numStagesWorkId, - outputDebugTensors, patchF2fp, sfBlockSizeA, sfBlockSizeB, sfBlockSizeC, sfLayoutA, sfLayoutB, - sfLayoutC, sfReshapeFactor, sliceK, sparsityA, splitK, tileK, tileM, tileN, tileScheduler, - transposeMmaOutput, useCustomMmaSchedule, useDeepSeekFp8, useHoistTryWaitForCustomMmaSchedule, - useMaxTmemOverlap, usePerTokenSfA, usePerTokenSfB, useShuffledMatrix, useTmaStore, useTwoTmaLoadWarps, - useTwoMmaWarps, useUnrollLoop2xForMma, validM, validN, validK, worldSize), + epilogueTileM, epilogueTileN, fallbackClusterDimX, fallbackClusterDimY, fallbackClusterDimZ, + fuseUtccpWithUtcmma, gridTriggerSecondaryA, gridTriggerSecondaryB, gridWaitForPrimaryEarlyExit, + gridWaitForPrimaryA, gridWaitForPrimaryB, hoistLoadTaskInit, hoistMmaTaskTryWaits, k, kernelTraits, + layoutA, layoutB, m, mmaK, mmaKind, mmaM, mmaN, mockAllReduce, n, numEpilogueWarps, numRegsCastAWarps, + numRegsCopySfLdsSttm, numRegsCopySparsityInfo, numRegsPerThreadEpilogueWarp, + numRegsPerThreadNonEpilogueWarp, numSlicesForSplitK, numSlicesForSliceK, numStages, numStagesMma, + numStagesMmaWithinWorkTile, numStagesMmaAcrossWorkTile, numStagesWorkId, outputDebugTensors, patchF2fp, + sfBlockSizeA, sfBlockSizeB, sfBlockSizeC, sfLayoutA, sfLayoutB, sfLayoutC, sfReshapeFactor, sliceK, + sparsityA, splitK, tileK, tileM, 
tileN, tileScheduler, transposeMmaOutput, useCustomMmaSchedule, + useDeepSeekFp8, useFlexibleClusterDims, useHoistTryWaitForCustomMmaSchedule, useMaxTmemOverlap, + usePerTokenSfA, usePerTokenSfB, useShuffledMatrix, useTmaStore, useTwoTmaLoadWarps, useTwoMmaWarps, + useUnrollLoop2xForMma, validM, validN, validK, worldSize), actType, clampBeforeAct) , mBatchedM(batchedM) , mBatchedN(batchedN) @@ -310,7 +311,7 @@ inline bool checkAndUpdateBatchedGemmOptions( TLLM_CHECK_ERROR((options.mRouteSfsImpl.value() == RouteImpl::Ldgsts || options.mRouteSfsImpl.value() == RouteImpl::LdgPlusSts) && options.mRouteImpl == RouteImpl::Tma, - "RouteSfsImpl must be equal to RouteImpl, or Ldgsts/LdgPlusSts, when RouteImpl is Tma"); + "RouteSfsImpl must be equal to RouteImpl, or Ldgsts/LdgPlusSts when RouteImpl is Tma"); } else if (!options.mRouteSfsImpl.has_value()) { @@ -379,8 +380,6 @@ inline bool checkAndUpdateBatchedGemmOptions( if (doesRouteImplUseTma(options.mRouteSfsImpl.value())) { - TLLM_CHECK_ERROR(!batchM, "UTMALDG.GATHER4 only supported for batch N."); - if (tg::mmaKindIsBlockFmt(options.mMmaKind)) { int const numEltsPerSfRoute = batchM ? 
options.mSfBlockSizeA : options.mSfBlockSizeB; @@ -392,8 +391,9 @@ inline bool checkAndUpdateBatchedGemmOptions( if (!batchM || doesRouteImplUseNoRoute(options.mRouteImpl)) { - TLLM_CHECK_ERROR(options.mSfLayoutA == tg::SfLayout::R128c4, - "options.mSfLayoutA has to be tg::SfLayout::R128c4 when not being routed"); + bool isSupportedSfLayoutA = options.mSfLayoutA == tg::SfLayout::R128c4; + TLLM_CHECK_ERROR(isSupportedSfLayoutA, "options.mSfLayoutA has to be R128cX when not batch M or not routed", + tg::sfLayoutToString(options.mSfLayoutA)); } } @@ -422,12 +422,6 @@ inline bool checkAndUpdateBatchedGemmOptions( options.mK % options.mTileK == 0, "K must be a multiple of tileK when using Ldg based SF routing"); } - if (options.mClusterDimX > 1 && batchM && options.mRouteSfsImpl.has_value()) - { - TLLM_CHECK_ERROR(options.mRouteSfsImpl.value() != RouteImpl::Tma, - "2CTA BatchedGemm does not support routing Sf along M dimension with TMA."); - } - // Check if all elements in mBatchedM or mBatchedN are the same (uniform tokens per batch) and // set mIsUniformNumTokensPerBatch and mBatchStride. if (options.mIsUniformNumTokensPerBatch) diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/Enums.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/Enums.h index 8c921f41968..9e86b808ec0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/Enums.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/Enums.h @@ -107,6 +107,10 @@ enum class EltwiseActType // act = relu(x0) ^ 2 // where x0 is the output of the Gemm. Relu2, + // Silu is defined as the following operation: + // act = x0 * sigmoid(x0) + // where x0 is the output of the Gemm. 
+ Silu, }; //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h index ed50f012b86..0d4a19e89f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h @@ -130,18 +130,18 @@ struct GemmOptions int clusterDimY, int clusterDimZ, CtaSwizzleType ctaSwizzleType, tg::Dtype dtypeAcc, tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeMmaA, tg::Dtype dtypeMmaB, EltwiseActType eltwiseActType, bool enablesEarlyExit, bool enablesDelayedEarlyExit, bool enablesGlobalPtxKnobs, int epilogueLdtmDps, - int epilogueLdtmBits, int epilogueTileM, int epilogueTileN, bool fuseUtccpWithUtcmma, - bool gridTriggerSecondaryA, bool gridTriggerSecondaryB, bool gridWaitForPrimaryEarlyExit, - bool gridWaitForPrimaryA, bool gridWaitForPrimaryB, bool hoistLoadTaskInit, bool hoistMmaTaskTryWaits, int k, - KernelTraits kernelTraits, MatrixLayout layoutA, MatrixLayout layoutB, int m, int mmaK, tg::MmaKind mmaKind, - int mmaM, int mmaN, bool mockAllReduce, int n, int numEpilogueWarps, int numRegsCastAWarps, - int numRegsCopySfLdsSttm, int numRegsCopySparsityInfo, int numRegsPerThreadEpilogueWarp, + int epilogueLdtmBits, int epilogueTileM, int epilogueTileN, int fallbackClusterDimX, int fallbackClusterDimY, + int fallbackClusterDimZ, bool fuseUtccpWithUtcmma, bool gridTriggerSecondaryA, bool gridTriggerSecondaryB, + bool gridWaitForPrimaryEarlyExit, bool gridWaitForPrimaryA, bool gridWaitForPrimaryB, bool hoistLoadTaskInit, + bool hoistMmaTaskTryWaits, int k, KernelTraits kernelTraits, MatrixLayout layoutA, MatrixLayout layoutB, int m, + int mmaK, tg::MmaKind mmaKind, int mmaM, int mmaN, bool mockAllReduce, int 
n, int numEpilogueWarps, + int numRegsCastAWarps, int numRegsCopySfLdsSttm, int numRegsCopySparsityInfo, int numRegsPerThreadEpilogueWarp, int numRegsPerThreadNonEpilogueWarp, int numSlicesForSplitK, int numSlicesForSliceK, int numStages, int numStagesMma, int numStagesMmaWithinWorkTile, int numStagesMmaAcrossWorkTile, int numStagesWorkId, bool outputDebugTensors, bool patchF2fp, int32_t sfBlockSizeA, int32_t sfBlockSizeB, int32_t sfBlockSizeC, tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB, tg::SfLayout sfLayoutC, int sfReshapeFactor, bool sliceK, tg::Sparsity sparsityA, SplitK splitK, int tileK, int tileM, int tileN, TileScheduler tileScheduler, - bool transposeMmaOutput, bool useCustomMmaSchedule, bool useDeepSeekFp8, + bool transposeMmaOutput, bool useCustomMmaSchedule, bool useDeepSeekFp8, bool useFlexibleClusterDims, bool useHoistTryWaitForCustomMmaSchedule, bool useMaxTmemOverlap, bool usePerTokenSfA, bool usePerTokenSfB, bool useShuffledMatrix, bool useTmaStore, bool useTwoTmaLoadWarps, bool useTwoMmaWarps, bool useUnrollLoop2xForMma, int validM, int validN, int validK, int worldSize) @@ -167,6 +167,9 @@ struct GemmOptions , mEpilogueLdtmBits{epilogueLdtmBits} , mEpilogueTileM{epilogueTileM} , mEpilogueTileN{epilogueTileN} + , mFallbackClusterDimX{fallbackClusterDimX} + , mFallbackClusterDimY{fallbackClusterDimY} + , mFallbackClusterDimZ{fallbackClusterDimZ} , mFuseUtccpWithUtcmma{fuseUtccpWithUtcmma} , mGridTriggerSecondaryA{gridTriggerSecondaryA} , mGridTriggerSecondaryB{gridTriggerSecondaryB} @@ -218,6 +221,7 @@ struct GemmOptions , mTransposeMmaOutput{transposeMmaOutput} , mUseCustomMmaSchedule{useCustomMmaSchedule} , mUseDeepSeekFp8{useDeepSeekFp8} + , mUseFlexibleClusterDims{useFlexibleClusterDims} , mUseHoistTryWaitForCustomMmaSchedule{useHoistTryWaitForCustomMmaSchedule} , mUseMaxTmemOverlap{useMaxTmemOverlap} , mUsePerTokenSfA{usePerTokenSfA} @@ -286,6 +290,12 @@ struct GemmOptions int mEpilogueTileM{128}; // Tile size for the epilogue in N 
dimension. int mEpilogueTileN{32}; + // Fallback Cluster size in X dim. + int mFallbackClusterDimX{1}; + // Fallback Cluster size in Y dim. + int mFallbackClusterDimY{1}; + // Fallback Cluster size in Z dim. + int mFallbackClusterDimZ{1}; // Whether fuse UTCCP with UTC*MMA. bool mFuseUtccpWithUtcmma{false}; // Whether load task A triggers the next grid. @@ -396,6 +406,8 @@ struct GemmOptions bool mUseCustomMmaSchedule{false}; // Use DeepSeek Fp8. bool mUseDeepSeekFp8{false}; + // Use flexible cluster dims. + bool mUseFlexibleClusterDims{false}; // The purpose of hoisting trywaits is to opportunistically peek at the availability of the next // k-block. It benefits when the next k-block is already available and thus sustaining the // momentum, but it adds latency to the first k-block for smaller k-loop. @@ -502,6 +514,21 @@ inline std::string toString(CtaSwizzleType e) //////////////////////////////////////////////////////////////////////////////////////////////////// +template <> +inline std::string toString(EltwiseActType e) +{ + switch (e) + { + case EltwiseActType::None: return "None"; + case EltwiseActType::Gelu: return "Gelu"; + case EltwiseActType::Relu2: return "Relu2"; + case EltwiseActType::Silu: return "Silu"; + default: return std::to_string(static_cast(e)); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + inline std::string dumpOptions(GemmOptions const& options, bool dumpRuntimeParams = true) { std::stringstream ss; @@ -547,6 +574,9 @@ inline std::string dumpOptions(GemmOptions const& options, bool dumpRuntimeParam ss << "mEpilogueLdtmBits=" << options.mEpilogueLdtmBits << "," << std::endl; ss << "mEpilogueTileM=" << options.mEpilogueTileM << "," << std::endl; ss << "mEpilogueTileN=" << options.mEpilogueTileN << "," << std::endl; + ss << "mFallbackClusterDimX=" << options.mFallbackClusterDimX << "," << std::endl; + ss << "mFallbackClusterDimY=" << options.mFallbackClusterDimY << "," << 
std::endl; + ss << "mFallbackClusterDimZ=" << options.mFallbackClusterDimZ << "," << std::endl; ss << "mFuseUtccpWithUtcmma=" << options.mFuseUtccpWithUtcmma << "," << std::endl; ss << "mGridTriggerSecondaryA=" << options.mGridTriggerSecondaryA << "," << std::endl; ss << "mGridTriggerSecondaryB=" << options.mGridTriggerSecondaryB << "," << std::endl; @@ -624,6 +654,7 @@ inline std::string dumpOptions(GemmOptions const& options, bool dumpRuntimeParam ss << "mTransposeMmaOutput=" << options.mTransposeMmaOutput << "," << std::endl; ss << "mUseCustomMmaSchedule=" << options.mUseCustomMmaSchedule << "," << std::endl; ss << "mUseDeepSeekFp8=" << options.mUseDeepSeekFp8 << "," << std::endl; + ss << "mUseFlexibleClusterDims=" << options.mUseFlexibleClusterDims << "," << std::endl; ss << "mUseHoistTryWaitForCustomMmaSchedule=" << options.mUseHoistTryWaitForCustomMmaSchedule << "," << std::endl; ss << "mUseMaxTmemOverlap=" << options.mUseMaxTmemOverlap << "," << std::endl; ss << "mUsePerTokenSfA=" << options.mUsePerTokenSfA << "," << std::endl; @@ -1158,18 +1189,21 @@ inline bool checkAndUpdateGemmOptions( if (tg::dtypeIsBlockFmt(options.mDtypeA)) { + int sfATileK = 4; int numEltsPerSfA = options.mSfBlockSizeA; - TLLM_CHECK_ERROR(options.mTileK % (4 * numEltsPerSfA) == 0, "TileK (", options.mTileK, - ") must be a multiple of ", (4 * numEltsPerSfA), " for typeA ", gemm::toString(options.mDtypeA)); - auto const numEltsPerSfAInK = options.mK / numEltsPerSfA; - TLLM_CHECK_ERROR(numEltsPerSfAInK % 4 == 0, "K dimension of scaling factors for A (", numEltsPerSfAInK, - ") must be a multiple of 4"); + TLLM_CHECK_ERROR(options.mTileK % (sfATileK * numEltsPerSfA) == 0, "TileK (", options.mTileK, + ") must be a multiple of ", (sfATileK * numEltsPerSfA), " for numEltsPerSfA=", numEltsPerSfA, + " and SF layout ", tg::sfLayoutToString(options.mSfLayoutA)); + auto const numEltsPerSfAInK = divUp(options.mK, numEltsPerSfA); + TLLM_CHECK_ERROR(numEltsPerSfAInK % sfATileK == 0, "K dimension of 
scaling factors for A (", numEltsPerSfAInK, + ") must be a multiple of ", sfATileK, " for SF layout ", tg::sfLayoutToString(options.mSfLayoutA)); } if (tg::dtypeIsBlockFmt(options.mDtypeB)) { TLLM_CHECK_ERROR(options.mSfLayoutB == tg::SfLayout::R128c4 || options.mSfLayoutB == tg::SfLayout::R8c4 || options.mSfLayoutB == tg::SfLayout::Linear, - "Only the 128x4 and 8x4 SF layouts are supported for B, got ", tg::sfLayoutToString(options.mSfLayoutB)); + "Only the 128x4, 8x4 and linear SF layouts are supported for B, got ", + tg::sfLayoutToString(options.mSfLayoutB)); // TileN must be a multiple of the number of rows per SF tile. int const numSfTileRowsB = options.mSfLayoutB == tg::SfLayout::R128c4 ? 128 : 8; @@ -1301,7 +1335,7 @@ inline bool checkAndUpdateGemmOptions( if (!options.mSliceK) { - TLLM_CHECK_ERROR(options.mMmaM / options.mClusterDimX <= options.mEpilogueTileM, + TLLM_CHECK_ERROR(options.mMmaM / (options.mClusterDimX > 1 ? 2 : 1) <= options.mEpilogueTileM, "EpilogueTileM must be larger or equal than mmaM."); } else @@ -1312,7 +1346,7 @@ inline bool checkAndUpdateGemmOptions( (options.mTileN & (options.mTileN - 1)) == 0, "For Slice-K TileN is required to be a power of 2"); } - if (options.mClusterDimX == 2) + if (options.mClusterDimX >= 2) { TLLM_CHECK_ERROR(options.mMmaM == 256, "Only mmaM = 256 is supported for 2CTA UTCMMA."); TLLM_CHECK_ERROR(options.mMmaN % 16 == 0, "mmaN needs to be multiple of 16 for 2CTA UTCMMA."); @@ -1320,12 +1354,39 @@ inline bool checkAndUpdateGemmOptions( TLLM_CHECK_ERROR(options.mTileM % options.mEpilogueTileM == 0 && options.mTileN % options.mEpilogueTileN == 0, "TileM and TileN must be divisible by EpilogueTileM and EpilogueTileN respectively."); - TLLM_CHECK_ERROR((options.mClusterDimX == 1 || options.mClusterDimX == 2) && options.mClusterDimY == 1, - "GEMM does not support cluster in X and Y dimensions."); + TLLM_CHECK_ERROR((options.mClusterDimX == 1 || options.mClusterDimX == 2 || options.mClusterDimX == 4) + && 
(options.mClusterDimY == 1 || options.mClusterDimY == 2 || options.mClusterDimY == 4), + "GEMM only support cluster sizes in X and Y of 1, 2 and 4, but found ", options.mClusterDimX, " and ", + options.mClusterDimY); TLLM_CHECK_ERROR( options.mClusterDimZ == 1 || options.mNumSlicesForSplitK > 1, "Cluster DimZ is only allowed for split-k."); TLLM_CHECK_ERROR(options.mTileM <= 128, "GEMM does not support TileM > 128."); + if (options.mClusterDimY > 1) + { + TLLM_CHECK_ERROR( + options.mClusterDimX >= 2, "When mClusterDimY > 1, options.mClusterDimX has to at least be 2."); + } + + if (options.mClusterDimX > 2 || options.mClusterDimY > 1) + { + TLLM_CHECK_ERROR(options.mUseTwoTmaLoadWarps, "Wider CGA sizes requires options.mUseTwoTmaLoadWarps"); + TLLM_CHECK_ERROR(options.mClusterDimZ == 1, + "Only options.mClusterDimZ == 1 is supported when having CGA larger or equal than 2x1x1."); + } + + if (options.mUseFlexibleClusterDims) + { + TLLM_CHECK_ERROR(options.mClusterDimX >= 2 && options.mFallbackClusterDimX >= 2, + "mClusterDimX and mFallbackClusterDimX can only be 2 or 4 for now."); + TLLM_CHECK_ERROR(options.mFallbackClusterDimX > 0, "options.mFallbackClusterDimX needs to be positive"); + TLLM_CHECK_ERROR(options.mFallbackClusterDimY > 0, "options.mFallbackClusterDimY needs to be positive"); + TLLM_CHECK_ERROR(options.mClusterDimX % options.mFallbackClusterDimX == 0, + "mClusterDimX needs to be a multiple of mFallbackClusterDimX"); + TLLM_CHECK_ERROR(options.mClusterDimY % options.mFallbackClusterDimY == 0, + "mClusterDimY needs to be a multiple of mFallbackClusterDimY"); + } + // FIXME: this is a bug in DeepSeek Fp8. if (options.mUseDeepSeekFp8) { @@ -1704,6 +1765,9 @@ inline bool checkAndUpdateGemmOptions( TLLM_CHECK_ERROR(options.mDtypeA == tg::Dtype::E4m3 && options.mDtypeB == tg::Dtype::E4m3, "A and B dtype must be E4m3 for Meta Fp8. 
Found dtypeA=", tg::dtypeToString(options.mDtypeA), " dtypeB=", tg::dtypeToString(options.mDtypeB)); + TLLM_CHECK_ERROR(options.mDtypeC == tg::Dtype::Fp32 || options.mDtypeC == tg::Dtype::Bfloat16 + || options.mDtypeC == tg::Dtype::Fp16, + "Only Fp32, Bfloat16, Fp16 output dtypes are supported for Meta Fp8"); } else { @@ -1738,22 +1802,35 @@ inline bool checkAndUpdateGemmOptions( { bool const isBlockA = options.mLayoutA == MatrixLayout::BlockMajorK; - // Block K size must be 128B. - // TODO Leaving this as an option for now in case we want to expertiment with other block sizes - // As the user is not expected to set this, do not fail if updateOptions is false + int32_t const padMultiplier = (isBlockA) ? padMultiplierA : padMultiplierB; int32_t const elemSizeInBits = (isBlockA) ? tg::dtypeGetNumBits(options.mDtypeA) : tg::dtypeGetNumBits(options.mDtypeB); int32_t const elemsIn128B = 128 * 8 /* Bits in byte */ / elemSizeInBits; - if (options.mBlockK != elemsIn128B) + // Number of non-zero elements in the k dimension. + int32_t const nzTileK = options.mTileK >> static_cast(isBlockA && isSparseA); + // Number of 128B SMEM slices per tile. + int32_t const smemSlicesPerTile = padMultiplier * nzTileK / elemsIn128B; + + if (smemSlicesPerTile > 2) { - if (updateOptions) + if (options.mBlockK != elemsIn128B / padMultiplier) { - options.mBlockK = elemsIn128B; + // This is to prevent a bug when the TMA box width is truncated to 128B (after padding) + // and multiple TMA instructions are loading multiple non-contiguous slices each. + // E.g. TMA #0 loads slices (0,2), TMA #1 loads slices (1,3) + TLLM_LOG_WARNING("TileK=", options.mTileK, " with ", padMultiplier, "x padding spans across ", + smemSlicesPerTile, " 128B SMEM slices. Setting blockK to ", elemsIn128B / padMultiplier); + GEMM_UPDATE_OR_ERROR(options.mBlockK, elemsIn128B / padMultiplier); } - else + } + else + { + // The larger blockK (128B vs 64B) is generally 1-2% more performant. 
+ if (options.mBlockK != elemsIn128B && options.mBlockK != elemsIn128B / padMultiplier) { - return false; + TLLM_LOG_WARNING("Setting blockK to ", elemsIn128B); + GEMM_UPDATE_OR_ERROR(options.mBlockK, elemsIn128B); } } @@ -1813,7 +1890,7 @@ inline bool checkAndUpdateGemmOptions( options.mAllReduceAlgo, options.mFuseUtccpWithUtcmma, options.mUseMaxTmemOverlap, options.mNumEpilogueWarps, isPersistentScheduler(options.mTileScheduler), options.mUseDeepSeekFp8, options.mUsePerTokenSfA, options.mUsePerTokenSfB, - /* useTwoCtas*/ options.mClusterDimX == 2, options.mBiasType); + /* useTwoCtas*/ options.mClusterDimX >= 2, options.mBiasType); } return true; @@ -1829,32 +1906,34 @@ inline bool getDoesScaleC(tg::Dtype dtypeC) //////////////////////////////////////////////////////////////////////////////////////////////////// -inline bool getDoesScaleAb(tg::Dtype dtypeA, tg::Dtype dtypeB, bool useDeepSeekFp8) +inline bool getDoesScaleAb(tg::Dtype dtypeA, tg::Dtype dtypeB, bool useDeepSeekFp8, bool useMetaFp8) { // Need to scale/dequantize the input A/B matrices when the input type is Fp8 or NvFp4 and // DeepSeekFp8 is not used. bool const doesScaleAb{dtypeA == tg::Dtype::E2m1 || dtypeB == tg::Dtype::E2m1 - || ((dtypeA == tg::Dtype::E4m3 || dtypeB == tg::Dtype::E4m3) && !useDeepSeekFp8)}; + || ((dtypeA == tg::Dtype::E4m3 || dtypeB == tg::Dtype::E4m3) && !useDeepSeekFp8 && !useMetaFp8)}; return doesScaleAb; } ////////////////////////////////////////////////////////////////////////////////////////////////// -inline bool getDoesScaleAct(tg::Dtype dtypeA, tg::Dtype dtypeB, bool useDeepSeekFp8, EltwiseActType eltwiseActType) +inline bool getDoesScaleAct( + tg::Dtype dtypeA, tg::Dtype dtypeB, bool useDeepSeekFp8, bool useMetaFp8, EltwiseActType eltwiseActType) { // Only non-linear activations require separate scaleAct. 
bool const isLinearAct = eltwiseActType == EltwiseActType::None; - return !isLinearAct && getDoesScaleAb(dtypeA, dtypeB, useDeepSeekFp8); + return !isLinearAct && getDoesScaleAb(dtypeA, dtypeB, useDeepSeekFp8, useMetaFp8); } //////////////////////////////////////////////////////////////////////////////////////////////////// -inline bool getKernelDoesScaleC(tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, bool useDeepSeekFp8) +inline bool getKernelDoesScaleC( + tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, bool useDeepSeekFp8, bool useMetaFp8) { // In the Gemm/BatchedGemm kernels, dequantScaleAb and quantScaleC are combined into one single // scaling factor (called scaleC). As a result, we combine the logic for getDoesScaleAb and // getDoesScaleC. - return getDoesScaleC(dtypeC) || getDoesScaleAb(dtypeA, dtypeB, useDeepSeekFp8); + return getDoesScaleC(dtypeC) || getDoesScaleAb(dtypeA, dtypeB, useDeepSeekFp8, useMetaFp8); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1865,8 +1944,8 @@ inline CUresult loadCubinData(CUmodule* module, Config const& config) // Trtllm links the cubin into the executable while Flashinfer loads the cubin from storage. #ifdef TLLM_GEN_EXPORT_FLASHINFER #ifdef TLLM_GEN_GEMM_CUBIN_PATH - static const std::string tllm_gen_gemm_cubin_path = std::string(TLLM_GEN_GEMM_CUBIN_PATH); - const std::string sha256 = config.mHash ? config.mHash : ""; + static std::string const tllm_gen_gemm_cubin_path = std::string(TLLM_GEN_GEMM_CUBIN_PATH); + std::string const sha256 = config.mHash ? 
config.mHash : ""; std::string fileName = config.mFunctionName; if (!fileName.empty()) { diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h index fd6c021e4f8..5631daf22cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h @@ -28,1051 +28,1165 @@ namespace kernels { // clang-format off -#define TLLM_GEN_COMMIT "b3c16468-dirty" +#define TLLM_GEN_COMMIT "b7b335a4-dirty" #define TLLM_GEN_EXPORT_VERSION "7.0.4.0.4.0" #ifndef EXCLUDE_SM_100 -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned 
char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned 
char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; #endif // EXCLUDE_SM_100 #ifndef EXCLUDE_SM_100F -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; 
-extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; 
-extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern 
unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned 
char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char 
Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern 
unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern 
unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern 
unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern 
unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern 
unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern 
unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_silu_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; 
+extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char 
Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; #endif // EXCLUDE_SM_100F #ifndef EXCLUDE_SM_103 -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; 
-extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern 
unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; #endif // EXCLUDE_SM_103 #ifndef EXCLUDE_SM_100 -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern 
unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; 
-extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; #endif // EXCLUDE_SM_100 #ifndef EXCLUDE_SM_100F -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned 
int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; 
-extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; 
-extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern 
unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern 
unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; 
-extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int 
Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned 
int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int 
Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; 
+extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern 
unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; 
+extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern 
unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_silu_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern 
unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; 
+extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; #endif // EXCLUDE_SM_100F #ifndef EXCLUDE_SM_103 -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned 
int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern 
unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; #endif // EXCLUDE_SM_103 static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { #ifndef EXCLUDE_SM_100 -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 116304, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "bc5e9a1d58573fb570bea7c17486541914a1c16aee174732b2d8d1efd07cf680", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 116304, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "9b8e8af75df59539d9751042f5d4fbe9323e1c1eab402e2741ad9fa9f424ac81", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -1094,6 +1208,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 
, /* mGridTriggerSecondaryB */ 1 @@ -1145,6 +1262,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -1182,7 +1300,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "0afcf4acab33fbf39eae7c8c65b99585d91fc5a1e47bed38b6173a169575bf6b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "a4e5b89eef49608fdd0f8f77c481ab518a60a0d85b9fe5c6dd0fdba77fdf6e6d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -1204,6 +1322,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits 
*/ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -1255,6 +1376,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -1292,7 +1414,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 116304, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "bbd604c909f350cc92f2e364ec9419079bb2d630f5944f51b4d75f442b892891", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 116304, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "5eaac110841fafa28f63c93543ac6768282fae65bc804a5427033a36ce6de383", "", 
nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -1314,6 +1436,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -1365,6 +1490,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -1402,7 +1528,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "7c911a38527162222e749808816306c2d571f119fffa6679c1e9e750807fd9a5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 
116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "d712b52226313bba3467e13abbd694486b9b4102d7810d785b1902abe7aeb204", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -1424,6 +1550,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -1475,6 +1604,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -1512,7 +1642,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 140880, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "40b0c9cfb1a50d155c23f95bd55bcce2794f1b1a403e9e2ab026e4509d3319d2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 140880, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "399191ec0f31d00f507eca1e38186c625d78bbf45b6f70973d147e31b8d47bf5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -1534,6 +1664,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -1585,6 +1718,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -1622,7 +1756,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 140640, 
"bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "e02a8ca4d6aaa36d48255cfe348a2c67acac06a9291ac3aa9e4652a23cbadb24", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "be953b3cabcda1d98bd03882613439e271afaf9f7c80df71fbcfdb88504cfbaf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -1644,6 +1778,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -1695,6 +1832,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -1732,7 +1870,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, 
-{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 140880, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "8bc025eae6e0365824fcae05a21f21e83336f85781c6dae34ce6428c94af5ee8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 140880, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "65df3c75cc7e307f146725b30d0f33b166075fc59e8c8cdc1405aabee7e555e9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -1754,6 +1892,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -1805,6 +1946,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* 
mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -1842,7 +1984,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "de06acc568c09a3e012cc865af6b9a32571de8d66df666b9aa13f0f260e41704", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "f31fe0405df81f623c9ceb71744d4a22eb924f2ec585b8f052f717dc7be99ded", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -1864,6 +2006,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* 
mGridTriggerSecondaryB */ 1 @@ -1915,6 +2060,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -1952,7 +2098,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 157200, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "d23b61bca482e22dd327aeb90b7ad36d73dde4c4212eb11678a0e191ea27f824", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 157200, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "2f17449dce0fd578e6746f141448500262b199c5667a69dfcaf52c16ec1f4c29", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -1974,6 +2120,9 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -2025,6 +2174,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -2062,7 +2212,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "573eaea532e2f3256f1b6b1ae4c2c0c9610b0ee91d3745b28bd9069b7f4ae11f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "2efecb61c6bd259ffdfeb77744fda2c1ac9b5a7c85370998a88ab5f3283dec6f", "", 
nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -2084,6 +2234,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -2135,6 +2288,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -2172,7 +2326,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 157200, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "b8879f801f60fff574d5f35c886bda0902bad7194aff4625d5122521b8c7c6cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 157200, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "bb47d190bad00036d032140884c7f544fc33f6dc62cf544bdae96e4c07579baa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -2194,6 +2348,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -2245,6 +2402,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -2282,7 +2440,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, 
"5f917a5a52770feb2c51ba34d298f0fc56aeec27aa283c2e4a4ed76f3a83e696", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "8aff7f6c0df5c78f16ba31099b01d922b699eb9f5ddcac4964654fd70e3c84a5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -2304,6 +2462,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -2355,6 +2516,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -2392,7 +2554,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, 
Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 104016, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "92202e3d7162247b4da8e674099dba08c6a78b87051ebc665140e42d5182aa46", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 104016, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "1bb0d3751f13408d28f76374b0a21e5a538906b2b7c3c45a9756334dc6ca067b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -2414,6 +2576,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -2465,6 +2630,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -2502,7 +2668,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ 
{batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "3fcac34676c7b71d248910601fda119245047d7a82af61fada7ac1bcd9aa13e7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "2dc54be62fbd2567dc417dbd1654f41fa7c8c6194e578b02218e286dd00a2655", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -2524,6 +2690,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -2575,6 +2744,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* 
mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -2612,7 +2782,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 104016, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "4d31a20074c70ded1867377ed32c4caf6978470c9a97df242c0b541751d5a7b7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 104016, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "220ae493f877ddc0c378b956374f65c97362bb2b47417a4d9964ecfe1073edc6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -2634,6 +2804,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* 
mGridTriggerSecondaryB */ 1 @@ -2685,6 +2858,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -2722,7 +2896,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "bb7b827d97ba255bbec7ad910fa0c2df9910865415c63984b6ddbcdff8885006", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "df74466e39a2290e6641033ef58b8ae9ba6773bfa9106f0e4d0b59d13813eb52", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -2744,6 +2918,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* 
mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -2795,6 +2972,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -2832,7 +3010,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 123344, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "5f4935a5c0227578327f85a148f6ea1ac0445e12b7b39f4827f76bf89c93bfbe", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 123344, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "85f56dbda3ff30379a8362d28e47e165107c62d9e7ac96101dcfad5361dd0de9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo 
*/ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -2854,6 +3032,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -2905,6 +3086,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -2942,7 +3124,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "864cf66a80b45eba64b825d6b01470863bee9117e2156f9af964a258a7c5890d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 123104, 
"bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "a1605981b8c2318d8b3e2217b8db2eb7f9de174f4c5275b520ac399aad67595f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -2964,6 +3146,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -3015,6 +3200,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -3052,7 +3238,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 123344, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "02954d2dd9b19e4d7bd1f35db4610c498907cb76041e763936b7f7ac74235bcd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 123344, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "dec0fdb0263f171fb6e072ab8606e27528dd828b68542ad9b39e74d71ddac875", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -3074,6 +3260,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -3125,6 +3314,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -3162,7 +3352,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 123104, 
"bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "645a89862e81ebc255b2bfa548dfeaef58ccfffbb4d8e04732117e0ea1aa6c64", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "4061963c1ead2fb066861c537fef784e067df0b6b541a6e152d646a32f1e28b1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -3184,6 +3374,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -3235,6 +3428,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -3272,7 +3466,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 114256, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "f62a0901e80121abc054ce96a3f923652295d11c7da316bd98a38ca90e6ba89a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 114256, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "1e76cd99ce910c5a0e60221aebe8ff20dcec8a8c460f7d41607b27ee7c7e3463", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -3294,6 +3488,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -3345,6 +3542,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* 
mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -3382,7 +3580,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "7b976cf5ae1bdfd6707102bd06590ab0cb63041e78529b79102a9718339b17ae", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "0d7adcbce4ae4d96a934650b595b39836283d3c124f3c55f8ed1e75a666283ec", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -3404,6 +3602,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* 
mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -3455,6 +3656,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -3492,7 +3694,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 114256, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "3baf1e8b0ccdad16e1c2ce0314d098db451f0daecf1e85c3ea2e8df2bce04756", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 114256, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, 
"cc4bbc5455fd18ddbd082f1f2acc2a1f17ce9380115eec8935d96411ee61de12", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -3514,6 +3716,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -3565,6 +3770,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -3602,7 +3808,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "c619d9a792e57da4604b24deec46673bf822598a54ed91eada9a62ef674e2e14", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "c0ab88d91215074fa63b5ebae02058cd0e3f62d8ff1cc2248d44fb5e1f5fc1af", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -3624,6 +3830,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -3675,6 +3884,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -3712,7 +3922,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 136784, 
"bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "11768eb35bff09f1dcb8d619095ece09ec47c828d95835cb56c31cbe3eb66472", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 136784, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "b85101f2db7fe62dd7873b41c9f67fb2c267062a6d23014ef13efd2cac6586ae", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -3734,6 +3944,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -3785,6 +3998,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -3822,7 +4036,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "0c89fe86107ca6eba47a0afed6180c1a5b80999ed990d664fc70d3665b8fc508", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "c0471472b79d9a5bd8f38fe928d490cfd87776ba6974b2a7d1a619f57f4e6180", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -3844,6 +4058,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -3895,6 +4112,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, 
/* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -3932,7 +4150,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 136784, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "85c3ca6331e8ad60ffbfdd1f19f44590559cede5d4869d2f65df9ac1f99525f4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 136784, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "7515143de13232b72c35b7f39659a9aaa09999e793c2ca66849849b695556382", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -3954,6 +4172,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 
32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -4005,6 +4226,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -4042,7 +4264,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "e719e9cf9a34ee277a9f105322fd15afc3e127c111a1319897dd2efb38020445", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, 
"bd05c50624c407a7c978933231ca4d17b7c6ddc3536bf9c194653e9b063f5f08", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -4064,6 +4286,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -4115,6 +4340,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -4152,7 +4378,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 149008, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "0be9e673362152a8c4848b73102d4fb7c4963759a0919a9284c788b1331708c3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 149008, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "2f5dbd9bdedaa65dcaeaf27cc0fd6c0ddf502e7bb2edec305bd4878bb8ecccb6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -4174,6 +4400,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -4225,6 +4454,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -4262,7 +4492,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 148768, 
"bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "2da2fdab64bbd51fcf7e458a56195a07cd042f79935bea281549fe7fc3c4cbbc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "c423764a32353ebfdca9b6d811a3f02c2bbabbd191e990fc9d344b3fd53cd84d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -4284,6 +4514,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -4335,6 +4568,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -4372,7 +4606,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 149008, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "ecec8ac4925b18b62cf73c3406ba1d4aeae8621fc83422957cb4fcad80ea06ea", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 149008, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "b934aa48344c6415298748ff14853d1c40c3b38ff67980afede842fba9023801", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -4394,6 +4628,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -4445,6 +4682,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* 
mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -4482,7 +4720,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "009f34105ba77411e3890368086d62550cdf40f332df53d64434fec0296e2097", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "ba9585735dcc7e11d15bcfdc088387a34efcd70bbd678e13bd9db9c3b7341289", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -4504,6 +4742,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* 
mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -4555,6 +4796,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -4592,7 +4834,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 102992, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "f9557406102a5967592b6a757548a845af5cf4f167176e14eb1b16d140a626b2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 102992, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, 
"b1561ccb06319de34422c48da98864772f9720fd470adbe30a85bb5b926e4473", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -4614,6 +4856,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -4665,6 +4910,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -4702,7 +4948,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "0365ee126d50fce8f96ff3cd4f86c2c6680be877cdeb333e3f41812883d256bf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "c9b41a0caa9acc9670dd8882d38f8767ec2b6e9306c019d6eb91392796b97eea", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -4724,6 +4970,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -4775,6 +5024,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -4812,7 +5062,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 102992, 
"bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "7b5e7736d063a1864a3f963a4519c3657fe207eb7d4787b4afe78f4e00139ef9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 102992, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "f195f2c977bd108e18651ed283d54aac3e6ff06a31a4d77e25cc0bde7b49197d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -4834,6 +5084,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -4885,6 +5138,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -4922,7 +5176,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "93e5b5e1dd839c4d2503e203bf2e40b7ce05f412038261c1a1d5e8c0890da5e3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "68e4c21fc489519c8fed4b5627c4b9289394801adbc7b7fe422598781ebcf5eb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -4944,6 +5198,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -4995,6 +5252,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* 
mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -5032,7 +5290,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 122320, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "869b815f75181c8cb6cb2599b73d995ab79ecba422c98f3b7b37108e8a4ebfa1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 122320, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "b0db3a94ff35f23cc92a84351ea5dbe61011e0324fd7960a589c134b7a5916cc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -5054,6 +5312,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 
+, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -5105,6 +5366,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -5142,7 +5404,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "45fb7c23182549f79ef881efa865a8004e1dd173257c6d06c4e42235f8cb6a1d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "d129f210d5cfff1c5a214f61793d18841bd33f9fec69533024bbe58fa4478170", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo 
*/ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -5164,6 +5426,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -5215,6 +5480,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -5252,7 +5518,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 122320, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "a606fdbbd7462bab02b1f9ca028d7d29d8ffbb50fb3307dea13e60621a65bcd6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 122320, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "70902172df67a11730e3cf632a8cc6ea27843babb70aa10cc767cfc7e1b0037d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -5274,6 +5540,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -5325,6 +5594,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -5362,7 +5632,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 122080, 
"bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "bf23e5d0903d8283cb07831272309e27d0c251c9a70bf5df36f4cfed601b2a0a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "1025c703b26a669b781b10da308ef22381ff7f1a274a8a408e1d85b6ad0754a7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -5384,6 +5654,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -5435,6 +5708,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -5474,7 +5748,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { }, gemm::SmVersion::Sm100a}, #endif // EXCLUDE_SM_100 #ifndef EXCLUDE_SM_100F 
-{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 211704, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "4c462072bd79230c3e63209b8bff4074d57a86d20d4213f5dc9ab5c7aac42e4d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 211704, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "7a43b1b698259473d4aaaa23a7141d574a9caf3db46982a33da48cd310814301", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -5496,6 +5770,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 128 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -5547,6 +5824,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , 
/* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -5584,7 +5862,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 211704, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "06fcdf41e1cd218a6e27b9b63d945d345f245b4620e96a9026f946774921f528", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 211704, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "1963ee6bd12e30cb4eacd71acc9e3e227bce8b25fdfe72d982fcba5f8dece6ba", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -5606,6 +5884,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 128 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ 
-5657,6 +5938,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -5694,7 +5976,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 196720, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "09841847b297a836668fe9bb6296d19d4e6d02245bf70e4efc4d7e667cf3f869", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 196720, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "d1e8cb7daf4b022164cd15aba93831506fbfe995784e7aff970e09cff33d56bc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -5716,6 +5998,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* 
mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -5767,6 +6052,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -5804,7 +6090,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 196480, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "456f483972ca4c8b54eeddfef052c1d5f68525767ca177653647040dd2266bf6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 196480, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f313ba12828fc91923cc6fb5c7f0aab4551681d4c172356396a1d73837963f98", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType 
*/ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -5826,6 +6112,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -5877,6 +6166,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -5914,7 +6204,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 196720, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "382b539df9c37a22a45753e26da799f70b514be404e169d2256893fb92c1a81d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 196720, 
"bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "bd8a8ad2582ddec2725befa44f518f35b46b1a7e1da5289a85ddc2931ea355c7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -5936,6 +6226,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -5987,6 +6280,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -6024,7 +6318,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 196480, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "209ea07bb19fd3e9ee16434e166d7c8e87d0a9e590d92c5c329d7b3f486fd84c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 196480, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "e5f0bc6c4a11e1c950c38dccb589a3cddac97188ec2176d1e329052ec28dcebe", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -6046,6 +6340,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -6097,6 +6394,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -6134,7 +6432,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206584, 
"bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "28a1e22576cb60f6855f30e5ddcd22f29d8221fad3a8ab8f1c3c0fec9bf59f76", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206584, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "45ae59c7ab527c5c5f175bc4adad2992c87de5f8d4eb606eb47ea44f2e034c37", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -6156,6 +6454,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -6207,6 +6508,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -6244,7 +6546,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206344, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "c54bd6682625d6ead59e3e70569bba68a5343f32c8e905d27fd9fe9938170268", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206344, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "72cd5e80f666ddbaf0816a7e1b80479b23570bdcd37e49bbd8133f7fd8367d17", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -6266,6 +6568,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -6317,6 +6622,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ 
-6354,7 +6660,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206584, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f75853bde81a2ba216b9f2957dec79ad776e5f7fa979f529e77f679f0b6f008b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206584, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "46302945003b2dc22df49a4dd2814a340b2b4a1cc0c5a15a1c08aa257e7e799e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -6376,6 +6682,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -6427,6 +6736,7 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -6464,7 +6774,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206344, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f8571debf1569edea0756a3cb365ee1d823e4176d33655175f16328ee6818fc6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206344, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "c9890608c86a2b9fa9f72a9a2bc0b3751ad86f9c4693df1fc452d84b6227b3c2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -6486,6 +6796,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* 
mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -6537,6 +6850,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -6574,7 +6888,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin_len, 219800, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f", 384, "8e9d6e30d63c7310a399e2870d08f7ff6220c45518e9d1c080ad8e6312a3bb6a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin_len, 219800, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f", 384, "b655bbbdb07a6f610d3992225da83155cdb9242c44494d28acbfba71fae632af", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ 
gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -6596,6 +6910,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 1 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -6647,6 +6964,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 1 , /* mUsePerTokenSfA */ 0 @@ -6684,7 +7002,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 221296, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "a34c102742981bffd86969beb1e1473f48b81c9c48e49197a4c362d231e9ac42", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 221296, 
"bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "9c7b741d90d402975f6d58dc75808dc292cdc79f6f931fd098e92b290e58de47", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -6706,6 +7024,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -6757,6 +7078,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -6794,7 +7116,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 221056, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "bbf8ec44483e345f4b1b1701fcc9a049365dc781b1b4731256cecf40acf29aee", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 221056, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "b344744b0ee91cad4c922fa855db736b179526ec21ff66436177618fa265cc8d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -6816,6 +7138,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -6867,6 +7192,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -6904,7 +7230,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 221296, 
"bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "905f395779cf7cd102e3f20f5ca605e826ae1da8ac98b816e6a8d9a45d583e97", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 221296, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "3b0db8ee858e6f3b5dde27758269212d5903a48a3b68363106f7030f17fdc9d4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -6926,6 +7252,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -6977,6 +7306,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -7014,7 +7344,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 221056, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "81b413fba90adfa18c024da7e37e415b082aff53c10804d8dca0735123241bea", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 221056, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "dbab80a9f6eedae187ce4a0c2c9c3cd6eb34fef3e7e7e3f757765f27d59d9bb9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -7036,6 +7366,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -7087,6 +7420,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* 
mUsePerTokenSfA */ 0 @@ -7124,7 +7458,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222968, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "82ee51c8d8f4cdd949cc92c5661ad0bbb813f6e9b89fa0d6f483a93f4b1cc31b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222968, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "070e2c48dddc5c7e06c3ef0b028d5dd6a2870995901c26af2fbdeeb15e044195", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -7146,6 +7480,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -7197,6 +7534,7 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -7234,7 +7572,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222728, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "7e811f617211f842044486f8fcb33dc383ab33b691344577c6e36af3e56c677b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222728, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "37a38c301d6041eb278db0af21780321209084c4b9730550e0965d59037203c9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -7256,6 +7594,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 
+, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -7307,6 +7648,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -7344,7 +7686,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222968, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "c34f5227dded240822293c6aefba1c56e113cdeeff902456ec794f98f20061d7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222968, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "bca5fabaf58b8df0097c80cfb3044cc35088e8f61a4a07d6cc394dd3b3704757", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 
@@ -7366,6 +7708,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -7417,6 +7762,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -7454,7 +7800,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222728, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "b05532c6ff1fe3f5e49c36e77599f837f43f65eea592330f32fa99d2851acdf1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222728, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, 
"aff87782620ab5f6b09fe3d499350e6fa9b362ba80abc45b3964f5d084747daf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -7476,6 +7822,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -7527,6 +7876,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -7564,7 +7914,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 210584, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "d5b304081c4bfebb15054d13a8ab2e6365843b9637a1379abb1249e0b8cb1c8f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 210584, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "9a0079b6a4b67746f4e93b083ed774824e99abeea50b19168ad78009889ac13c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -7586,6 +7936,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -7637,6 +7990,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -7674,7 +8028,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 210584, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "added3d481463811b30f8055b482dcb92e65059ff6a2f6fa27ea520250ed028e", 
"", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 210584, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f2e2bf90fad2370e5f26b740689acbb61a32da501e67dcc240e48be326d291b1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -7696,6 +8050,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -7747,6 +8104,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -7784,7 +8142,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 
184432, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f67c6796361acf2f26c129e6facd21b244310d5ca0b52715c0fcdcc5b8868f19", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 184432, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "92aef2e5fcf61530ce75048088bef709fc9f68e2cd91de8b16992a09d9f82622", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -7806,6 +8164,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -7857,6 +8218,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -7894,7 +8256,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 184192, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "a3ad5a694521b3f9455e33571a9e3ff254210d1b78eb85e57a49180311a58f7d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 184192, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "3b286cf2a0dd2bcd4ffd35485626a92087315ba9543645abb1af9338d9a884e5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -7916,6 +8278,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -7967,6 +8332,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -8004,7 +8370,7 @@ 
static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 184432, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "e4191ed005d6b44d72e063bf59135bfc59bafc1be7e2c42a041a9c44dae0f9aa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 184432, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "59b5dd42eaa934c7ecff176aea2acec0f5a549b9f43076422b2f229aaca3d9f2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -8026,6 +8392,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -8077,6 +8446,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* 
mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -8114,7 +8484,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 184192, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "4f0797cbb4f470b1e8cd21c728e178c28f4f7e7fb3225f59da4d82fe72a52867", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 184192, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "9d934a507d9cb176086502c9abc9fc4161ceadadd25b29f828a1dbcc575ac2c6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -8136,6 +8506,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* 
mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -8187,6 +8560,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -8224,7 +8598,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len, 163232, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f", 512, "ea111437a04eaaeddd626e2e33895a57005501793264f37056529f8ee2c22e23", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len, 163232, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f", 512, "c21434742ae17ee1ec25d77d77e13ec2c07a5ec8db9ba2e81c527b6e8206db0b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -8246,6 +8620,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* 
mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -8297,6 +8674,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -8334,7 +8712,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 203504, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "709ed8ba216b7e6b3d4a2157700b72da92337cf1c81818043232abee9b00a5c0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 203504, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "4bab230d1bf19c6f78935d46c15d3c911e1db0c34c332d3d4923edf5ef1db49e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ 
gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -8356,6 +8734,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -8407,6 +8788,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -8444,7 +8826,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 203264, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "7e5547497b434fd1bc8029d34f9d7a544ceb22ec065f985fc307be4f7908b569", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 203264, 
"bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "fa6d83a3fc7171727e1278bf90f8a0430c8ed74f52db1115abde9c72c35940ec", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -8466,6 +8848,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -8517,6 +8902,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -8554,7 +8940,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 209640, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "c007c728f3b5c2a3acb58c74dd0f0acac2aa0d3a549aa9a26a35f5a068d4d885", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 209640, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "9362d314db81551adb2a323dd5874e5ce8b88a26dd9573f7e0ff016cf993bf75", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -8576,6 +8962,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -8627,6 +9016,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -8664,7 +9054,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 213736, 
"bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "40d6ebda2c2bb8f1f09005982ccb4f64174982b8d137f22ba99820cf91965f8e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 213736, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "4318283ba311353a206ad351e9bc7e1eba882fc0e3e00cf8a9895086a4e4f95c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -8686,6 +9076,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -8737,6 +9130,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -8774,7 +9168,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 217832, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "3dfdf51e11cadcd51fbdd8dad85f3f0c57b05ea883c18ae15c084fea3ae262e3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 217832, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "d4b88be103ca399261c8d0d1c3ec309550f611a8b82a9404ae7c74b432e7eff0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -8796,6 +9190,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -8847,6 +9244,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap 
*/ 0 , /* mUsePerTokenSfA */ 0 @@ -8884,7 +9282,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len, 163232, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f", 512, "23a7d3f9dac9a32c49107460e73ba330b8f69bf55b1015d79357eba928228762", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len, 163232, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f", 512, "875fdf51ea1a434552dd37fc1f4bb828c4823c08b1543269df4e5242deffbfbf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -8906,6 +9304,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -8957,6 +9358,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* 
mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -8994,7 +9396,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 203504, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "a149290b4594cbc1a6ffe8935e02d4baf56b51ef01216bc658eacdf030cbc912", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 203504, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "2b94b8207efe9f30fe8f268410c445502b9bc9d3924529aae7998841d8f41a7f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -9016,6 +9418,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , 
/* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -9067,6 +9472,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -9104,7 +9510,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 203264, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "903d99f582c780854c0272a8c8c438a9fd411e2464f93b19ea0cfe233239fddb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 203264, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "d82c1a61ebe47d4f1abd1a3b159c5bcadb7b4c50d93df6cda1beca9b174b822e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -9126,6 +9532,9 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -9177,6 +9586,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -9214,7 +9624,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "0803136a8f86c97c34663bbe8410b8a4d4759f4c36bd789f8a1f584c368e28f8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "7457cae55bfa6fd28e69665a865650708963fb9bf209e95898ff9195c80d5308", 
"", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -9236,6 +9646,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 128 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -9287,6 +9700,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -9324,7 +9738,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206688, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "1dbd5f0672d965b488e021bdc3b1260d834c0822b3cb738aa59d5431635fd96e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206688, 
"bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "2c0c7a0b8f3a9dbec723cf8e1efa27cec4c52acf6e3fd3b06ef98cd551c8c212", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -9346,6 +9760,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 128 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -9397,6 +9814,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -9434,7 +9852,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 185848, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "3fd3868c93ca651676139616c8bc13255b03e75f63c809c560d724bf87200c93", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 185848, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "bcacdf193acd22502f47da3c4e3078dd9ba0c544afe289d6272f12d9f7adf228", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -9456,6 +9874,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -9507,6 +9928,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -9544,7 +9966,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "495cd84b3379c587d0a0f6a908cf861670514cf58a3b1eb21479ee40a9e62091", "", nullptr, nullptr, nullptr, 
0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "40f68e76c683030fadacf8b886049462fc28fe09710c318337e383777a10db0c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -9566,6 +9988,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 128 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -9617,6 +10042,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -9654,7 +10080,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206688, 
"bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "04bf66cdbebf70f3a9d2a589f14c79f28c84a0982c8114eea99613e75fb6c4d2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206688, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "dc1125453ddee76f883dd0ac968e935822531a4afdfa015d46fea30ef123a08d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -9676,6 +10102,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 128 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -9727,6 +10156,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -9764,7 +10194,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 185848, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "2fee557597fdb35b51a1c972d1230fb53714616c41272806abc7eef6cc72984d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 185848, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "d3863f7876b8f5f2251a2862903b9cd0c4d4d1a8fd108af3af2e275f5960fad0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -9786,6 +10216,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -9837,6 +10270,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -9874,7 +10308,7 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 121264, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "9ee8b8f337c843073ba17311534e55ef02cfa006abcc5a66739c2b745f45a0cb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 121264, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "1d92d451d88836268b794b1d22666dffbe3d004983882db3fd70c4a9829359cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -9896,6 +10330,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -9947,6 +10384,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* 
mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -9984,7 +10422,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 120976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "73b7c8d58c1eee631743a6b3f80e64b605af04d15e3f1f2eb08b5ab1f30b0167", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 120976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "02813c7a5a6c032f8ccbc29b630169c201e09ec57cde4b1aa01c41188ba48a74", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -10006,6 +10444,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* 
mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -10057,6 +10498,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -10094,7 +10536,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 121264, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "860734c4bc2b0d67b0e0627582b6791f932a1382df9a251e95bbf3e04005e989", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 121264, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "709f152ad27a81092d44c2386da9b8cd8d0009febaeb1ce3600710f01dd88bdd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -10116,6 +10558,9 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -10167,6 +10612,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -10204,7 +10650,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 120976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "2e3a448a8dec61e7eeb21ad26e5a7c4d1ea6108417cc86bc8a134ba80f4b8b58", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 120976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "bcf0f7235168ab03f5f93613f4377dc224f8fd87db6efecd3626def16c864f6c", "", nullptr, nullptr, nullptr, 0, { /* 
mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -10226,6 +10672,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -10277,6 +10726,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -10314,7 +10764,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 229840, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "0e44b3e85f7c97a2a4468f1669dbe64f58c9a6f4d9790f1f4ef1887d93a9f1e5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 229840, 
"bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "2c9bc9f4b8a9ff1645dddf4283a836899d68153c8697f1ba9fe448dacee05726", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -10336,6 +10786,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -10387,6 +10840,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -10424,7 +10878,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 229600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f", 256, "93705ec342a13dbdb8a0b34bbf11b3664c885b1913a178bc6860ac13a0088d85", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 229600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f", 256, "2dc185cc8ba6e34964afe2254711dc1370c93f273c493e8219ee49dfec2472cb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -10446,6 +10900,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -10497,6 +10954,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -10534,7 +10992,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 229840, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "51441838147dac45da6c5e15e9fc74e3ad16cbcfab58e20b64087265e5d688a1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 229840, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "bb84dbfb30122f4995daf16debf77e361275ab673e7408e6f61704cd4837c477", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -10556,6 +11014,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -10607,6 +11068,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -10644,7 +11106,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 229600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f", 256, 
"44769852ba406f70f365ccc73577c16cc27ad571ff718f60e7015ef591225179", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 229600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f", 256, "10c6a725ea9c408621a8d2952209bd1184f1d5e9cec30fb52f0d87a2f0cb8627", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -10666,6 +11128,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -10717,6 +11182,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -10754,7 +11220,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 215544, 
"bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "89d93dc9c518bbd9e89836a6f7d8de4ed840de8db5e3a8ff074d58e5742f8247", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 215544, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "b77b206a4f3bf6ded291976f7c31e0bd65ebf9054fd27a8403ad266e587b5f89", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -10776,6 +11242,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -10827,6 +11296,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -10864,7 +11334,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 215544, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "792ef43d508959d8793d0e5997722710b9a4398e4b9c20865dfbf8b51cc6110c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 215544, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "6a36658210ca3ce32903749d31ebbd50f7dea450afe4c76f4ddb978dba876b59", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -10886,6 +11356,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -10937,6 +11410,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -10974,7 +11448,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, 
gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin_len, 220632, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f", 384, "8cffc046f4e02e331cb17e206f0e954e144f8e7a0cd806ded8910f1ca572ac2c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin_len, 220632, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f", 384, "22ee142006a7836d7af4d834793fe71d774bb48508ef258b069e36f97951b5d0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -10996,6 +11470,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -11047,6 +11524,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 
0 @@ -11084,7 +11562,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin_len, 220632, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f", 384, "e63c04918657e51e7594ab40abdbf48b0b7dc0552a881ddf27ec8104b975172b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin_len, 220632, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f", 384, "9639ce4803fbf4ba7c286a7db6be7921b001c4b3dc1c06d845b8e8a76e8f6118", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -11106,6 +11584,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -11157,6 +11638,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* 
mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -11194,7 +11676,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 137648, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "7af1d4257c766f602b9e15085e7ed506d3a9051a2c42196285316f19098bf14e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 137648, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "f080f7247441b9901c7bfb95207502c753f2880aafb831151735e0da4962adee", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -11216,6 +11698,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* 
mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -11267,6 +11752,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -11304,7 +11790,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 137360, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "c2de9435ae086c8173b0d398a94bb1429455ff6aedef0aeb92c8520a04e697df", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 137360, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "2497d0d3793191281bdfbf647acd669c1dd3bdf573fa354d347564eedf412b85", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -11326,6 +11812,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* 
mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -11377,6 +11866,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -11414,7 +11904,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 137648, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "09d857159c1e53a3b3bcc13218fd239080d65ea0ecd02582722cc8ff148486d1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 137648, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "580ca9d9d00b141203ac0f8a56030780fbdcecfebb09dc11b787d6c433b3b5a0", "", nullptr, nullptr, nullptr, 0, { 
/* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -11436,6 +11926,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -11487,6 +11980,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -11524,7 +12018,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 137360, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "92189fd74803a1fbb638bcdd55196efb53fb566bde053ad6e9c7a86d2486837a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 137360, 
"bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "30c7b544fe9e80ab5386016a8e1ff86a373a42e23a292a95475bdc33c0e5a4a1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -11546,6 +12040,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -11597,6 +12094,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -11634,7 +12132,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 217520, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "5cb3dab2ef7e2f175bf347456446a0a05e57a047774700af6cb222ed1d983a29", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 217520, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "9ae15612f37b46e71cff5a5910841b1cb04bbad3f6e4754b6ac9a832390708b8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -11656,6 +12154,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -11707,6 +12208,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -11744,7 +12246,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 217280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f", 256, "0ff6b702a953d89e66a9d91dcdfdbb27cea7b85b9a7950957d55cbfa6c23e69e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 217280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f", 256, "267a04b39f69b08c3e33008ac4a3afe7096f4ec1f98b0f50793cbe6118c997ab", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -11766,6 +12268,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -11817,6 +12322,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -11854,7 +12360,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 217520, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, 
"a82d62b874f9f52a2b6def073658bd5e15e5a4ba24f98e7fb743c8be422dc042", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 217520, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "d6d32b642894499374e868600bcd9d7f8218280784d45733f3dc6567d01124c6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -11876,6 +12382,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -11927,6 +12436,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -11964,7 +12474,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 217280, 
"bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f", 256, "435055cc40fb550681591584c8e3ec0b6934116a5e05f46f9539a36ac54f2266", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 217280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f", 256, "3dce960a8f28e8ea852f82e45aa4edfa65f8a7f1b838420f0542d0c0e15931b8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -11986,6 +12496,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -12037,6 +12550,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -12074,7 +12588,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 172208, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "68946c1276e51ee06ba1a06c5d42cf70af634d2c0a63d0f971b3c1550bdda90e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 172208, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "3774a748843badfb9318cf7298457be136a3effd36d7b4dc4ee5986875f355d0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -12096,6 +12610,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -12147,6 +12664,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -12184,7 +12702,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ 
{batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 171920, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "ca3c3adbc3e656839624ce2f124d0efff02c579ec08a372f54e0075d43305668", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 171920, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "6af5b157aa7b1ca96e40e560b73b25f3b09e68b68bc95c7f62a7241e74d09201", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -12206,6 +12724,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -12257,6 +12778,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 
0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -12294,7 +12816,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 184856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "2e907155fa74dd305d0e06f50af31a9af8a067a6757c27023f900f848e15132f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 184856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "6a96bbd9e790f9d7def6603ed7093fee9c7ffa93ca13859fe27165fefe43ce51", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -12316,6 +12838,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -12367,6 +12892,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 
1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -12404,7 +12930,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 172208, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "e9f38f13002f7c2af12fc5e3f1674de1357b7135a2fb7d89a08d992c55eb746c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 172208, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "a4b91b8d36332a3d2f95ee71e568af60cf8a15a73943a08d06ce2b5da9a10576", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -12426,6 +12952,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 
1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -12477,6 +13006,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -12514,7 +13044,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 171920, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "4cc7e0be049ed6105f92224f131e5796dd7f7bcdf8c395bb82ca884c65868cf2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 171920, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "21ce95a00368c18a4405e0f1e6c338e04324ec22757698a787e99e4a87811881", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -12536,6 +13066,9 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -12587,6 +13120,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -12624,7 +13158,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 184856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "949fa126b88d4ab80de87cefbf3691ab3cde5ba0069454d4f56c32f20e0a9a6b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 184856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "f422820c9b19c2d224fdebc84fb16a5f076065640b9eac4aba3992070e11a034", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ 
gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -12646,6 +13180,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -12697,6 +13234,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -12734,7 +13272,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len, 77600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f", 384, "0228b8628bba8677e48228382ce73af40d5f9e476028419eb610057dd58203c5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len, 77600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f", 
384, "d3e09597919a7dcc5a8e86825578a45450b31d53fcd5adbb4542b2a3c2a82e59", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -12756,6 +13294,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -12807,6 +13348,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -12844,7 +13386,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 148240, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "163ae0618d252e6d38f9e32afb67507951ef3d2844bcf338011db07a1ee449a5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 148240, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "8f273834ba9de1c84b5708cc2a3b841c7354f3f5c0969df9f851981283949f96", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -12866,6 +13408,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -12917,6 +13462,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -12954,7 +13500,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 147952, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "fe4087a0328effd933219bfed6cd6e0df79f36205753ccd11be02a61a1f3d0bc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo 
*/ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 147952, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "fbf6db6b83ddc1a78e46d9ea0e23b84dd38c0aaf4c7e1f2dab0aeaf97e9f709c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -12976,6 +13522,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -13027,6 +13576,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -13064,7 +13614,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len, 77600, 
"bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f", 384, "69bc1e7c266f2084daff322af2f477e0c035bfaaee4e6f09291c53f650bfd8cb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len, 77600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f", 384, "519b8bc18d56715d82e3dda4d60fbb005052266b98d04178ec8912adaafa2107", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -13086,6 +13636,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -13137,6 +13690,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -13174,7 +13728,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 148240, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "165db8a353f546ddbad90c989ef274e2cc9e2de94163a256952b6e05332255d8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 148240, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "99d0ffd11a984bcade478ab1b04c6eb09a597de15d8d15d0bc3e10d06098ac58", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -13196,6 +13750,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -13247,6 +13804,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -13284,7 +13842,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , 
/* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 147952, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "d3baffa228944734d49130c835d7b25f3b5f7f1b2e6c240ad2d6050b969a2c48", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 147952, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "f9876605e77760b82a6e1672b4d27cc8fcdd866a48e0e883e5ec2d504ddc37bf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -13306,6 +13864,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -13357,6 +13918,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* 
mUsePerTokenSfA */ 0 @@ -13394,7 +13956,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 215504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "f3c2524182bc50078e5357eb71731707bf1dd30efe8b69a6c7cd4b95657092f4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 215504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "2f8feb889334608650b16b2767df326b02ee1aa8f06e12354d5c920b3f06b998", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -13416,6 +13978,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -13467,6 +14032,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 
*/ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -13504,7 +14070,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 215264, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f", 256, "2da4c64a5fbea5871ee39877165fc6a6cf22de6d42c58d8984cd19b8c6b75247", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 215264, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f", 256, "c53a5ecc7e1a53783cdb5b6b0f980b31b4bb551c3e6126c569cf0938300d72e9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -13526,6 +14092,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -13577,6 +14146,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = 
{ , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -13614,7 +14184,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 215504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "645a0c5eda479b4db08221e6b0c0567ac8b457f05a402fd27da858c344236519", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 215504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "64d988efdc5a21e5aa409014d9d368e8873dc851b02a50cdcbfba32e831c8152", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -13636,6 +14206,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA 
*/ 0 , /* mGridTriggerSecondaryB */ 1 @@ -13687,6 +14260,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -13724,7 +14298,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 215264, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f", 256, "0d9fd887a12f988fae46ffd789cbb6094d2a1121e26f99ebb29365ee341caf5c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 215264, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f", 256, "9fc7b945ad3d52ff6f026928e80d03e1773bfa31a0e5dfeaa0f26791afb90725", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -13746,6 +14320,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* 
mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -13797,6 +14374,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -13834,7 +14412,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f", 256, "6d632e3c91737e5e0d7a1f5b090d768be5186acf25403318067194c862df4056", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f", 256, "4b9e65a29bb4a9dd96343279edf60708111381e1ae269d1e82159921e6199f03", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -13856,6 +14434,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* 
mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -13907,6 +14488,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -13944,7 +14526,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f", 256, "41d540b9837a53ce267412d51d87690883c589412c68952aa31b011983de3c15", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f", 256, "c277a0a577ecda2fede1ebe700699e66959b8bf34550cb346fb780b9f3f234d2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -13966,6 +14548,9 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -14017,6 +14602,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -14054,7 +14640,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f", 256, "8028f7044315d9db113e9e25e12ab8830fbb4cf6bfb6230741fad9510aca16f7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f", 256, "0f44efa1339f57480bfba5e9e01c95bc5f5bab340fbc643381bfbddcb11f6fb0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 
@@ -14076,6 +14662,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -14127,6 +14716,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -14164,7 +14754,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f", 256, "e697e45a7aded5d95052ef7121aa225c1fd773499cc455ab4966a5de6085b217", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f", 256, "e33e84156301e1232b0b9ab850e44cc87201b98d906ce0f5bfe1faf86c16550c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo 
*/ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -14186,6 +14776,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -14237,6 +14830,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -14274,7 +14868,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 73296, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "20aab5a9d324b7f5c3c1f61786a24ef2e0ccbd611aff4793edb5ef0528af2afc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 73296, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "23e6879f5682da3e1f59ae8b00746bd7ec41417245d0b951b5a8a77c09524174", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -14296,6 +14890,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -14347,6 +14944,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -14384,7 +14982,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 73056, 
"bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "5c4cd11d88e82457667572a82bdca59062d9bc81e2c051706a8927bf55a3cc50", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 73056, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "a5eb733a3aab4d139a26faaab0eb2e845695f9c005e563b5623a674096d0f680", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -14406,6 +15004,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -14457,6 +15058,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -14494,7 +15096,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 
1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 73296, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "6d7efe60853d043cdf209a0de77466425dbe4ba76f1efec78df4698f30ee7f2e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 73296, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "19daac3f0534632794d5e92e45d90b2ce189a65b6696cb427f51ab8ecab2f8e9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -14516,6 +15118,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -14567,6 +15172,7 
@@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -14604,7 +15210,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 73056, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "a7c37f21e95f5cbeee57c59eba577bc941bb82329d4ca5e3de91b8c9251e78af", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 73056, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "219f31e6f3aa0f75f8f51de901b41c4957a79c53bf2ba318d426fd6075887cc5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* 
mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -14626,6 +15232,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -14677,6 +15286,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -14714,7 +15324,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 85456, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "96e55f596dab7810031cd52480bc7f0b47a1fffcff99238d73bddf66419bf3d7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 85456, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "8110128fefc020ff91e0a4544c9521aad059b002f2d6ddf3e5466c9f8b02e26a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -14736,6 +15346,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -14787,6 +15400,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -14824,7 +15438,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 85456, 
"bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "ca2221abfad0b20ec3249df74698c4b9aaec2cef30f4dcf9a95b872c2513f00c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 85456, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f478b64fb7a249d5bc0172d722ed4e09bd8a7fa95568b15b1f0defec3b6b01b1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -14846,6 +15460,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -14897,6 +15514,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -14934,7 +15552,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "c4fa06c68d1c0d08f359847ed23c3279c83cb8d07f0b0aaedfbe677fdcf1bce7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "f4462ea202b088122cabe88cf0540877e047828572d6859eb0ffb5f89ec3e9fa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -14956,6 +15574,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -15007,6 +15628,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* 
mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -15044,7 +15666,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "957b078f4bcb0789a8e2b16aa5464b8f70b00b3daf361174f942ea32f851e0f6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "962284c3315c8612cc29fec7dcc0866b9c643b4b7905c2df92847f1a3c467c4a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -15066,6 +15688,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* 
mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -15117,6 +15742,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -15154,7 +15780,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 85456, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "7f4ff0e696db70beef97ecf7a1ba77a2370d13dc973cb6e683435abdaf78d8ef", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 85456, 
"bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "47b2d58c75b1ae744a8f706a2f82d35762079cdd920fa68d9f79bd06add94bba", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -15176,6 +15802,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -15227,6 +15856,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -15264,7 +15894,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 85456, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "e3ae72bc5d77684de83a4d7de04f0052320b71db6b651da0f389e19de0052e88", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 85456, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "6094bfa636eec622c6027a7e47c5b60f3f5242ce0fb432dff5007868e5d44e59", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -15286,6 +15916,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -15337,6 +15970,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -15374,7 +16008,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "71497e02845979048b72e03619e26db05f47e942f486d31844c126e40d2ce9fe", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "a35a768b6f7edadd07a5c13e455ebb15ad5cd4f80558244fcde6de08e52b2474", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -15396,6 +16030,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -15447,6 +16084,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* 
mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -15484,7 +16122,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "14de1f999958a579ef2a0c09a75582fca9af0e5a0d94682a2ad680fde28753c4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "e4c41de71568ba09493c833635821599c97533af2784675aab5d51f8d3632d8a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -15506,6 +16144,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* 
mGridTriggerSecondaryB */ 1 @@ -15557,6 +16198,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -15594,7 +16236,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 97872, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "4ed3f82f32d486791cdfad96c790c3f7e386962d9df33f4e29e4be43316e90ed", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 97872, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "d612f542487d0a5a44e3adb43b91443235544c4e4ef08c858a44d61cbd9209cd", "", nullptr, nullptr, nullptr, 0, { /* 
mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -15616,6 +16258,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -15667,6 +16312,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -15704,7 +16350,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 97632, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "c0e3bb9e807761267266e52d7b29618b49c1222abdced8a2167f6dca0b94f45d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 97632, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "d412cb418d5b77674a0506490819f225e336c74311f4f31bcf8ffb416c56e1d0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -15726,6 +16372,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -15777,6 +16426,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -15814,7 +16464,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 97872, 
"bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "f34cb831ebbd0e771c975d5d12463990a2aa3f43e96a99b3c6a341d4584df7a2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 97872, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "f34ed1401ace3ea024983cf2d236319794f7c5ffb0df98f163fe723eac68d3ff", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -15836,6 +16486,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -15887,6 +16540,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -15924,7 +16578,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ 
{batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 97632, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "78fe4b003a2c08564321b19a46c241ecd4575906bb0644bf8fd2f54fba59b0cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 97632, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "7683e58ef9bd4bbf989c604df802673ca5dfad80c80f50bca05acfa8fed1cae5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -15946,6 +16600,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ 
-15997,6 +16654,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -16034,7 +16692,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 114128, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "82f0e36d35614a603908baa85a3b59a0509c7560e7152ff553ea5bddc74aaf4c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 114128, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "ab3762e4eecb847752142ea69e200fbae45fd8ffee64bf2a64c6c412a5e40f4e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ 
gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -16056,6 +16714,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -16107,6 +16768,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -16144,7 +16806,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 114128, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "16282aaeb68078492025a3585fcd24fb72e9ce5821604dfe201ababa49153659", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 114128, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "6c1b89b2724b6947029074137da3fb66d3a894759308802dda011b7ba620da54", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -16166,6 +16828,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -16217,6 +16882,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -16254,7 +16920,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 113888, 
"bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "e10c0224b9ae40f181b03646354274acca5e338bff1d4c874036ffaff616c628", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "acecc3720906a3c99b6883656c2710a43ab67c9d2c7f4f789c1059fc1e95ebf3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -16276,6 +16942,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -16327,6 +16996,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -16364,7 +17034,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt 
*/ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "594d233181fcf06bffd60377976c0fb216ce18ccf23e807c7fd2bc1246474ae5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "b7e5d825bf16d396b1e26641f71cfb3626f303d2de193a24f2705816f1c4199d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -16386,6 +17056,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -16437,6 +17110,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* 
mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -16474,7 +17148,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 114128, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "7fe5e317140b4abf908d57c45dbdc7bf64062b7413404ee70f3f45545dc8013d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 114128, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "03008dd9e3361e476a10ba22f53e745da5c0f33acf905c36df2011cb0e27d44d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -16496,6 +17170,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] 
= { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -16547,6 +17224,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -16584,7 +17262,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 114128, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "108e5fa414e2edf1d018c4223fac21f4dee2293a30fb9341d0eae3b262e6aa90", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 114128, 
"bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "ad0e3e79fd6ede2187e363740f065cefccfbd83c100b8c984b9e4d5bade6db06", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -16606,6 +17284,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -16657,6 +17338,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -16694,7 +17376,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "dab9488d409915f0a0e9f41f5ce1f921cd6213d09fd11405c91799a0d6be246f", "", nullptr, nullptr, nullptr, 0, { /* 
mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "b51065ec78b6da570e8f7342f4c8090de16dd94e923d90911ca4d56ff8667fa1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -16716,6 +17398,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -16767,6 +17452,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -16804,7 +17490,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "95a36353da567e414f8d58b16775c22b6581154db21b43532f61cca622d7768e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "06a68fca6141ab75f3a832b445b6e73a2621d2cdc6c1584e7f3f05406ddf312a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -16826,6 +17512,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -16877,6 +17566,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -16914,7 +17604,7 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 148048, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "276e6544b5a5e789b17fe270f0412e0e47d484a400c294bd1c593a46d20031d3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 148048, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "8d5db6d22e534c02a9826d14e2644fb8de8b7dc2d257a8903217376e4696311f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -16936,6 +17626,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 
0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -16987,6 +17680,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -17024,7 +17718,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 147808, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "c7d9ba96bdf692e01310aa34cf61c9beba3e505d59772add5988e0ffbd46738d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 147808, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "0ecc0f2af9c37b4dd53d4e2088cde06a712866bb190a83003640667c200eeec1", "", nullptr, nullptr, nullptr, 0, { /* 
mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -17046,6 +17740,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -17097,6 +17794,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -17134,7 +17832,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 148048, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "c32adc6ad64bf73948a666fe560bb2eaa210eaa898132428a7be923fd165d1c5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 148048, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "886c05d53173b4bc1ec4711f51ce3f4272f37c495f246606011622ecfda732c6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -17156,6 +17854,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -17207,6 +17908,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -17244,7 +17946,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 147808, 
"bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "dda1f4dd9aa0db9c015290540d378be934f3108c983f3c90474bd045977530a2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 147808, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "db1105607066af2acc7bc6f1a44236b2f8273ef0cfb576f5f7683e81a5770318", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -17266,6 +17968,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -17317,6 +18022,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -17354,7 +18060,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* 
mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 172496, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "f3a4db69b502a9ff57b8d007cd0ccd2df738d8bdfea027b5557cffd583a23d52", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 172496, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "bea95a5ca490eb16c159b1212ded860e451a518f36d2f6c0142351bd7bd460b7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -17376,6 +18082,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -17427,6 
+18136,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -17464,7 +18174,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 172496, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f3133904382bf6861c8fd49caf898b5d3bf4ebced22dd6467e0571c5c9081b15", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 172496, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "4bd3fa3be9ccd6d292e11db708a84567d090a36e712f0f86b8ebddf7d9120878", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -17486,6 +18196,9 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -17537,6 +18250,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -17574,7 +18288,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "2a7555f2a67bd824fac43461ad3c31c9466f45fbf0cd89b8321d331893934847", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 172256, 
"bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "e998ed66b7319f512c001370d7d4e8aee1637e5b4d4770ce11016ca8ea6034f3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -17596,6 +18310,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -17647,6 +18364,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -17684,7 +18402,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "cb386773aa2c3faa73a5a14fa95da6c73ad13feccbf7d472ce0173d20eba1db0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "37b6fefd7c5520d9e42717b28189fae249a80572bd7786c4a798b560068054e7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -17706,6 +18424,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -17757,6 +18478,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -17794,7 +18516,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 172496, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "222dec48978238606f92b0484eb0e183675f4dea271ee85e8624734d03958a7d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 172496, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "2d137e1b53eae5f55fbb6f85e43228ae8121999e27221b7a30c9c6f0a88c537d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -17816,6 +18538,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -17867,6 +18592,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* 
mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -17904,7 +18630,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 172496, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "82716bffa0f69e7b6e7b6d89a5789c2a0d8a75334371af664b48eb06373d1659", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 172496, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "6cb281661d9792f6f1a789d5611dc8ddbccc94513cb78b2de2484e5f4040878e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -17926,6 +18652,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* 
mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -17977,6 +18706,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -18014,7 +18744,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "8317e5b5951fbab83d1dceb7440d9d64150f745abd4d6baa5170c9f14d6ecb48", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, 
"10a3b555a8cdda63a4e4a5c4f53cc210e5f662e2acd2004b96df6ed74873df62", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -18036,6 +18766,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -18087,6 +18820,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -18124,7 +18858,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "cb349d0e2071c1ed6275d76a557ba8e745852a3b9f86be9b6c6d372a7567992b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "1fabbe9251d7e97636322670bb9d6892c034c7a7ef29c068aa170084c65140a7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -18146,6 +18880,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -18197,6 +18934,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -18234,7 +18972,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 61008, 
"bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "f471163d75dabe7d07acd2d07ad9a0a502b1d37caaae0200d723512becf9c1cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 61008, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "8d2e75d8cd37b0b41727b75722218e5004c29bb77141293a547f333ded7107af", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -18256,6 +18994,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -18307,6 +19048,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -18344,7 +19086,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* 
mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 60768, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "330af66a59b02a7d6c80d13933860f8564479075408e20d7366d115452b3568c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 60768, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "e796d504151b5150e4bca77b198fe914d1368bbe6780b4cae95dcaf23d0c63c7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -18366,6 +19108,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -18417,6 +19162,7 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -18454,7 +19200,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 61008, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "1890c9310a40ee337a404455acb8665ea7907195fb7a6eacd76ae62e104b5f53", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 61008, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "25fb2f38d1520631d1ca1c284092f6d34705a841c7e6955f17708503f8df50b1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* 
mClcFastDrain */ 1 @@ -18476,6 +19222,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -18527,6 +19276,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -18564,7 +19314,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 60768, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "b8bfc30fd671f90b206592495445aab722cfdcf9ca50fbdb827368dc568ae518", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 60768, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "518e5d05b921772c198926e1383e98c2ab8dafa4b0173b073526573312ea3492", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -18586,6 +19336,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -18637,6 +19390,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -18674,7 +19428,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 71120, 
"bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "192a49bc69857bf20f9bf966c13605cb2babc404859c9b489580a5d569a37555", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 71120, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "0f830486c564762184309c0d36a8e36cf6c12c8f7384810696dc01cbdf40ffbd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -18696,6 +19450,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -18747,6 +19504,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -18784,7 +19542,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* 
mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 71120, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "353fc23a5f9a238892248e56206ebdee8b8cbc204ee5fd40c1fa2ec4c99e24b8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 71120, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f9adf6b437b59d8a3347f707e60adbbbc2e3475aef70980f09d20d09051f1cbd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -18806,6 +19564,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -18857,6 +19618,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* 
mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -18894,7 +19656,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "2d2bc053782651f9badca5e4ecb595fdf3ddfe7b23dc1f7db7d0048743a45924", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "8b10910ec7ceba451d2d567f913d71110863abfc3b5d53ec7610ee25e6334bf9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -18916,6 +19678,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* 
mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -18967,6 +19732,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -19004,7 +19770,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "d0c20c11ac8368169a67070dead47052cc7eadf991d991adfe65414cf7360bc1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "02902c53f733cf9f6c9b9e1f988afe693c3dfd5327d32e481f192f2bde2f5732", "", nullptr, nullptr, 
nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -19026,6 +19792,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -19077,6 +19846,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -19114,7 +19884,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 71120, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "b8f8f31372f7be6c7c7c2e49fd56ee1f24ab5a12ace2b76f876438328449486d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 71120, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "7a47b13c2597c668aee7cebb2a10d2ec5c49f056c770f026e061b9b431a2cf1c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -19136,6 +19906,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -19187,6 +19960,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -19224,7 +19998,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 71120, 
"bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "2993b344ff566a137c5461d0ba0c1f7d57aae041c47be01290f24bb16e234fdc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 71120, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "b0cc1927c836c87c79320d23d0b0baaa3c50b9592901c7247efd1a2995944bbc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -19246,6 +20020,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -19297,6 +20074,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -19334,7 +20112,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "6c11434a040a6bc91cb9fac17dff32205ac1a3a18ad06d6fd2e28356ef759036", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "fb4e3364d9067b8b53ad119c336b14cabd6e457318868e6b1698dd647b72268d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -19356,6 +20134,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -19407,6 +20188,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* 
mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -19444,7 +20226,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "14ea1e97ec537058c447beacad93ac0d9bbf65e1d205322766361a74a9312869", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "8975f425328437a0080ec032d28652100bae2cda3b4490ed6727f593a3d4ebcd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -19466,6 +20248,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* 
mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -19517,6 +20302,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -19554,7 +20340,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199248, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "ed4e7e1a1b116d287fa9207ff252234c91c81f33324cc58dc8c3f3612c948ec4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199248, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "f65c15548e7898d023d2de9ad160e0b9a84da58037a22dafa4cb1cecd04017eb", "", nullptr, nullptr, nullptr, 
0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -19576,6 +20362,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -19627,6 +20416,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -19664,7 +20454,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199008, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "77300db9a68a9aec3613cfd3d4961aa7dda60d7563e2b8a262fba50cf8aa5e0c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199008, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "384bc216ac79bb982c9817dc262d24363c37f6fb5383a08683c4efd61a137d9d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -19686,6 +20476,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -19737,6 +20530,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -19774,7 +20568,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199248, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, 
"ffcc063d92df8828b993b3392859b0326302360b01b7ba33ec2be2c4b8d26e38", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199248, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "e8ca0def94839d7006d11c0e23250364d0f9bb0ba762fcb96c94a46972a3ab2f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -19796,6 +20590,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -19847,6 +20644,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -19884,7 +20682,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199008, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "2e4ecd933d833b9d8e6d61fba2c83ff27f922df33d90cf0a094775c8731607e4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199008, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "0135f87b025375ec0de6db6a23afc80f8a153e9cc8158b02564cf5727c1218b3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -19906,6 +20704,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -19957,6 +20758,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -19994,7 +20796,7 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 223824, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "a4cc4366fe59ca1232e3ffb04b7dc207c4bdcf71b6302c73e893f72279b5c614", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 223824, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "a1317c4898243ea77eb01f250acb3e0a877a1a506f89dada6e5f3ad373c55179", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -20016,6 +20818,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -20067,6 +20872,7 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -20104,7 +20910,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 223584, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "2f8e20fc285c12a0458f88de9ae470958171d2fc1de390b12d9a5ad6adc3608c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 223584, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "d2dc4226d392905808a5a20f8b90015448c8effe6e41baa13a0ba0b48210f8e4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -20126,6 +20932,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* 
mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -20177,6 +20986,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -20214,7 +21024,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 223824, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "34a3922bcdf8a0cf7811e2454d17e5e481289e74cd812d9e42ad2c16a19cb2a2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 223824, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, 
"2eb2b2175b8fe29ff14020560dfa9b8dff64650098eea4ddc913da86154e260b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -20236,6 +21046,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -20287,6 +21100,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -20324,7 +21138,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 223584, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "77d61a3b2e987481ab08800db6d21257e01323930493edc5a93496a7a720c879", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 223584, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "02bfd93040a63db63e6be75c2b6e14debbca7b03642c56f06f00a806f9dc8476", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -20346,6 +21160,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -20397,6 +21214,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -20434,7 +21252,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222736, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, 
"2b6f1d74d90c2a18fd203873cfb0af03b0a6a3bbde0f739f021cdd1efdd4c5f7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222736, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "699837aafa51ae11803fdece15c5e6d7210f21c331cb274ee87244a88e2db28d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -20456,6 +21274,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -20507,6 +21328,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -20544,7 +21366,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222496, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "577d1792d953e888813c2bfd4f8f3f39678a915835ef5598d02bf281dd57a336", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222496, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "fb06ae65cce73e41c4b7fe3f415acbf13710191249da0cbf478478d64e194b68", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -20566,6 +21388,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -20617,6 +21442,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -20654,7 +21480,7 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222736, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "19e4833d97258b5623a723003ccc67409990a521f8c426b865cfc9289750932f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222736, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "414cd1dccd06d196c446b348de7c0dfb052eef99dd73619f22489b7d4dcf80bc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -20676,6 +21502,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -20727,6 +21556,7 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -20764,7 +21594,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222496, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "d674836603cbfe454fa4237b969e42e93633aae2e767f62280cc25dc505f1cbf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222496, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "7d69b625d2c2ad15d5ac928deb6704ca2706c230824928fe5f994b67f609132a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -20786,6 +21616,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits 
*/ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -20837,6 +21670,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -20874,7 +21708,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 186960, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "15e4de3011dcef79a8bfbaf0604dc4792df9720eeef74161154f4471dad18309", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 186960, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, 
"c4d99e7bd5a3e7f7ee66915a9898a28c9e5b6ddd5887b3078f5ba8eb0bf911af", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -20896,6 +21730,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -20947,6 +21784,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -20984,7 +21822,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 186720, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "6bb46252d0a738e1171369f39bd96d0493191fa48a248ad00fb9231a20106bda", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 186720, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "c7c445c453df9363e3eca88414aa7af2b74409001ded953b99447d4f2d49d217", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -21006,6 +21844,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -21057,6 +21898,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -21094,7 +21936,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 186960, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, 
"c66191e5e5fb15837ab9a38d8663723c186b84c0e6f8805ab4f9f44f9aadcf0d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 186960, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "4d1919a8c7877d5da4c731472be259ea48b3b6750a5c65f5bf379e9b51f43e79", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -21116,6 +21958,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -21167,6 +22012,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -21204,7 +22050,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 186720, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "c5650b0406947e00d4f445e86b253cd04a5bb3e59d1397c51b353843a7742f63", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 186720, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "60fc99c0936fc790b84070e8d521928f14c4dff48cf039c033915ac6a8b30e00", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -21226,6 +22072,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -21277,6 +22126,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -21314,7 +22164,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] 
= { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222672, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "fd454f7bbb5c5a64003e719532c04482f9be185402404f0d5061ab0caf746cd3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222672, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "44ceaa574d3ca14b509f703a6a868914a63cbfdb92d1052d8764929335a35eba", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -21336,6 +22186,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -21387,6 +22240,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* 
mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -21424,7 +22278,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222432, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "9b14764f69ba26d3131829a62a0bbb742f9c7fc0a960dab57bdceb187c7d211e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222432, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "1d77797d39ce0765e22a9cf1cb5dd24c488f809948cd57f3f296b4bf6d7b191f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -21446,6 +22300,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, 
/* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -21497,6 +22354,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -21534,7 +22392,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222672, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "290a695a36aa8344c472f84d20aef2c34d737b8abf05ddb7067a9fc0ca930293", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222672, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "7522ff7575e025387c7c503f1f96c46d2645cc40b35bfa3deb79079582722337", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ 
gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -21556,6 +22414,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -21607,6 +22468,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -21644,7 +22506,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222432, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "7741ee5f0ce585b153997b62f0f2506010fe51b815b6da8156d081a8151ab575", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222432, 
"bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "40c353aca457bb2287792234ac71413b35142cafaf2603c946b3f07c2c9265bc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -21666,6 +22528,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -21717,6 +22582,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -21754,7 +22620,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 203592, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "a6265f811277cf7e711f169e1da2eed48bef4120c7827a959e53d04fa049863f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 203592, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "54350618f0faae5c3d22b9e446d179caf24280d49d9e4717befd4af63ad4275d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -21776,6 +22642,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -21827,6 +22696,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -21864,7 +22734,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 203592, 
"bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "442bd4b754327c7a1f83a7a08bf941f5bb27dd60072faa233d30d6fc77e6ca97", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 203592, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "e04625e0c34e89a31bd8a802be005b73bbfd2360887e1227ceb81c0285d20a24", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -21886,6 +22756,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -21937,6 +22810,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -21974,7 +22848,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 227928, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "c3d4df8af897767383e196f049f1cbab688869975aa1cd9b19ca3e6844dbd96f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 227928, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "478234f94a0daf5725bafd5fb8fecb3c8f5a0fa34a1a6b7d2bbcbf208228ae45", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -21996,6 +22870,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -22047,6 +22924,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* 
mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -22084,7 +22962,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 227928, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "a77253d4af0491a8fbf3d3145d1f53fdd7907e504f7f67948718ad52ab65c64b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 227928, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "9caad0b8bafdd1e46796157ee49136f30517c84bc20f4c1bc43de61b1bbc99d4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -22106,6 +22984,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* 
mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -22157,6 +23038,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -22194,7 +23076,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199408, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "57094c6c7978ef63a4f8f3c1dab5ab1a3f07c237f534059a9ecfbd5bbb021856", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199408, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "579711409a3a3ce06432292e2b778cd9fd4011bc6a657d5b4d8107769e19a82e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -22216,6 +23098,9 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -22267,6 +23152,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -22304,7 +23190,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199168, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "ec5875dd09de3e3dd32f32d8aefef4b4385f847a1769ddfeda3b4c97a0e6696f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199168, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, 
"a685f96d682362041b262e8aa22eeeba486d4f716a991ce3bd8064d2e267acae", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -22326,6 +23212,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -22377,6 +23266,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -22414,7 +23304,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 225112, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "22121c621f005c4766040eae5dd5643e8b09e770a756f2626ade1f9486ff82da", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 225112, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "6066e188cbf21fde0f4e3e121357b5ca729b687c80b67169a6c713ef52cc166c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -22436,6 +23326,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -22487,6 +23380,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -22524,7 +23418,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 224872, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, 
"2bc2308228ab5f6365027f0545c3e15066fa3771970b60fb2e1fc7a3775c5ea8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 224872, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "acf8de8072dceee75ad043c7536aa12b45734837566432e1036e1bed0eb14f2a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -22546,6 +23440,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -22597,6 +23494,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -22634,7 +23532,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199408, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "cc402d696239c853f29b251292aba3116b15cd410a780cffe08ff8d8933ce9ea", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199408, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "958f3f3739df3b1034fbf33b2ccfcc6621418ac37142eb7c2f2037211f4c23a4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -22656,6 +23554,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -22707,6 +23608,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -22744,7 +23646,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] 
= { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199168, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "b4b7c7f326d4c25f577f158e29cdbef27d4570c394279288893dc4e438289ebc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199168, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "31412a05bd4543da02e2ebccfa778654c4edfc65fca452376814efdb6f86d20a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -22766,6 +23668,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -22817,6 +23722,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* 
mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -22854,7 +23760,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 225112, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "9be6f96f443f094f55f2627cc69fd33caf6be5c29583e9161618244e8879e838", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 225112, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "6752830f0a41bcc3905eb567a30a3741d8ae6833eaf17677472eb502d19ba514", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -22876,6 +23782,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* 
mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -22927,6 +23836,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -22964,7 +23874,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 224872, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "93c83ea56921432815eb08ae65be6552b10649ed48ce101f1521062e100dcdcd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 224872, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "d03c306279a89eac2ed5537db56b6e5e8acbe72d708ad6859e42fe03f38f2b10", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -22986,6 
+23896,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -23037,6 +23950,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -23074,7 +23988,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin_len, 197272, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f", 384, "69880f0c911f857f11b7ae46295f335524c732f5d0284b3033b6e7556035f0ac", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin_len, 197272, 
"bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f", 384, "00567f222ab71fa2c15d8a43308ee7b892e3be84dadf2ae60b78bedbe98e85d8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -23096,6 +24010,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 1 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -23147,6 +24064,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 1 , /* mUsePerTokenSfA */ 0 @@ -23184,7 +24102,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 225008, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "659189d6f0db1de4e1b114b5c5ce7a774551e3e76ee89976a764bd62260addae", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 225008, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f9bcd9d11377483ae915075decfacd936e41c68fd1dcbd7dd7a6f999b8f5858f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -23206,6 +24124,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -23257,6 +24178,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -23294,7 +24216,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 224768, 
"bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "50fdd4c86c5984c52a5c17c5891ae4c2b5ff1ceb1862a6bc4b920242002f883c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 224768, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f01afffea85ec098133c806cf7117218ea3eb8914911e80eaf39b4733a9e12ea", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -23316,6 +24238,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -23367,6 +24292,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -23404,7 +24330,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 204536, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "1bb31f80019369a73865a3049a89c513d17cd9d031a6ed5ee92cd9cd31322c65", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 204536, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f203d37d346b80214c543f29f72340d9d8caff2c0b3dfa7021033018e49c7a41", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -23426,6 +24352,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -23477,6 +24406,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule 
*/ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -23514,7 +24444,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 204296, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "fb801349440c41e798cd3c432460ed9fed52ee679e9beaf7dcb00a23f321085f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 204296, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "b846217462c09c953775c1505af225b71d793fc7f0e9bdd0e9ff45f96f482c32", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -23536,6 +24466,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -23587,6 +24520,7 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -23624,7 +24558,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 225008, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "be887dc0c11ba2bbbf3eb1a3f0a57868085b035decb26b29421c6fe7ba52f64b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 225008, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "4a295791e5a8d5d3c396a4ce870096b3b0ee7c8b89cdefb763d06e704fa01444", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -23646,6 +24580,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* 
mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -23697,6 +24634,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -23734,7 +24672,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 224768, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "6386da50cd0030abdba6db42bb10015433b95e0c8d8968fb4012d4201b3e5b0f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 224768, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "ab906008c1cedc720b812d118a766589ed51a75541b05c405890e26aa6ac138d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo 
*/ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -23756,6 +24694,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -23807,6 +24748,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -23844,7 +24786,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 204536, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "0fe5360c8d7d616250743260297eb65d76b064e2b3c8db13b5295bdc4d337f61", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 204536, 
"bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f9d2fe6f8dd4492770bafdf3875d841d2f53626827a5638ab279fd6193758e79", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -23866,6 +24808,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -23917,6 +24862,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -23954,7 +24900,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 204296, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "5af572e15e96313129cbdb568cbb69d6e2e152c6d2dbbdf9167fffcaaf8cdcd6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 204296, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "220d21a909b8f5b58b7b1b2816bba44e309c3f3e25477a47a98a2df51861a17a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -23976,6 +24922,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -24027,6 +24976,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -24064,7 +25014,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 171960, 
"bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "c909e07a2bfe6b1804de36905d8c2d0c76dd9acd243e319d0b38e8cbd85d2bc7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 171960, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "0f6126d30bb398c5f07bb085bd6ae79694d3b7e3a2136a8ab9e0dd891f26b7c8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -24086,6 +25036,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -24137,6 +25090,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -24174,7 +25128,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 171960, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "c54adc6836f9c645980e10b73e0356af6e2e6e920c8c189ad41cd9c23cf3c49e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 171960, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "2cdc9fbc165bebffebaa0194ecc97c2d78d8ebf4a8523d0cc4e45833b313987d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -24196,6 +25150,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -24247,6 +25204,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* 
mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -24284,7 +25242,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 192152, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "74d1860fad2300afe8731db7afdb95c852e4ea79ab21ee37fefe46fe1b72fb96", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 192152, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f121489ee66e414cc3641901c975fb8c096ed8e97e1bd3060fc27848e91f08c7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -24306,6 +25264,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 
0 , /* mGridTriggerSecondaryB */ 1 @@ -24357,6 +25318,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -24394,7 +25356,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 192152, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "3ac762d9a70b81abf4395f3091dc801d42dc75c1c0060870c6684a4f8b8636e5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 192152, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "d608fb3ab512d1527cbca946e51503e36002d515daf5b7bfb71690f84ad54e66", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -24416,6 +25378,9 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -24467,6 +25432,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -24504,7 +25470,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 223056, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "7b9db4adb37890b89ba1639ec22e6df51fdb43915381e15bd79e1475dc3c2f53", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 223056, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, 
"17d3d212e31ffc56f5d94675aa8769c07c068acfb6e9f0a1cc4fb53761fd37a8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -24526,6 +25492,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -24577,6 +25546,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -24614,7 +25584,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222816, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "94937503dab460da4a098b93dbc0c7054b2df737aeafb7aea75fb0b2696d1608", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222816, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "5d0d39a0f6f590e24ac6cb8f170231fe7da2d53bc6494dc5a7535a5bb0a83ae0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -24636,6 +25606,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -24687,6 +25660,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -24724,7 +25698,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 229208, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, 
"7d6e5a4ed5c563e5a921019f2234571356b4a19ab6d5b60b35d42aaafedbbaaa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 229208, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "e47e3ac56c568a4dff51544305fb7710c44d722eaaa7613fae555644db8e1076", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -24746,6 +25720,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -24797,6 +25774,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -24834,7 +25812,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 220776, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "b877bb92c361eff2a5c2b9182b888a849690730bbe1698edb8946f494ac86dff", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 220776, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "89a46e501afbfaff7c8f346cf84f237917eaf40e212e0c7a7a07b3858e3f6aff", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -24856,6 +25834,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -24907,6 +25888,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -24944,7 +25926,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* 
mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 223056, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "134c570b9a804543b350e45657716e1156b2747fbb6de2506b4cc642e37e34f0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 223056, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "7bd67d4fe727e8deccafb7b7090459e06945d803f27d974f91dd7d49f5148685", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -24966,6 +25948,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -25017,6 +26002,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* 
mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -25054,7 +26040,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222816, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "8a070bdc6b463926897b3bf6960eeb4d63f6860b1ab4007ff2f8492313b962bb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222816, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "3f5a4c0d60200ef54b652717ae5dfe76592fd0dc803aa0576cd96f5e1ab09f6f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -25076,6 +26062,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* 
mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -25127,6 +26116,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -25164,7 +26154,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 229208, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "bde30a55f99ddbc1cdcf0a73be0a7dd016c49a20e51b31e605758ab1326cb049", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 229208, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "0e3be39fb462d648561680a9c789c3c234bd1f780eae7166df3d037373f4bfe7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ 
-25186,6 +26176,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -25237,6 +26230,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -25274,7 +26268,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 220776, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "af241a4e0cf5f2a85e03bd1ed2b8d72798f12b388a035082babdad65bc33167b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 220776, 
"bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "b6d6800ca5931d6f35a7bc573bf817645ef655476c45ac29b6f8b3d60f964eed", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -25296,6 +26290,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -25347,6 +26344,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -25384,7 +26382,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222768, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f400cd5784830fd1c99ac76b920970cfde640c8d2cdb495cfeab2072cd6baa0e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222768, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "2cdaa14c0e4745e0a3a7a6ebed48c6b54049cdf6c5b22302703e6a4f60e34d2d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -25406,6 +26404,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -25457,6 +26458,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -25494,7 +26496,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222528, 
"bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "225dd7ad71b9504dd0687cd4d8bb61fdc357903b43abc90382281f9f9f3aaaaf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222528, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "6f1023733b81ca69a279d1648a19bdc842a9f6e604151c9a2293db23865cbfaa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -25516,6 +26518,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -25567,6 +26572,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -25604,7 +26610,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 228920, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "99a1871208c492c86af39d1e9e529b6b9748ab76e84af5d99f98acd8d7ec25f3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 228920, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "b6955ccdbaa1a1d00e0e3a16f4d91319959be1d2cd2df73947f9779e8bfa60fd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -25626,6 +26632,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -25677,6 +26686,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* 
mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -25714,7 +26724,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 220488, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "5175e7555a16dd56edb47f92b94e4c2bb9d740bc4b5f05c26256a3c509b40764", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 220488, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "39bc011f37dd392c41668b79956951d3ff563256943e0f7aa442a2ab4cf84a6b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -25736,6 +26746,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* 
mGridTriggerSecondaryB */ 1 @@ -25787,6 +26800,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -25824,7 +26838,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222768, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "31e87d2eed3a20db57d59b33b9ef8ae255600d6b8f9eaf24d871d791a3ac5294", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222768, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "a0d7e3075c1531ca381b808abdd7cb878328efa37552df7e6b06bc1ba281e21f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -25846,6 +26860,9 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -25897,6 +26914,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -25934,7 +26952,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222528, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "e670f58b311cdaaa7592b8e63a9fbfeeda8d8d06b47306ae2d89bf7d7a1856bf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222528, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "56296de38781ef502302e77cc25acf95a477834aabfd1958f57dc9dd810bf2b2", "", 
nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -25956,6 +26974,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -26007,6 +27028,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -26044,7 +27066,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 228920, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "c8bc080df22c1bd8f4aa3be0d67c3f2d9276c0206b83b9c5e3ee69c504f987c6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 228920, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "40a9aa30b938e93c28f0c1c60cc36bd9cba0379298d67cb853fdb492fc178769", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -26066,6 +27088,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -26117,6 +27142,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -26154,7 +27180,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 220488, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, 
"d6ef4b0b584f97db292f0402c2b8361f8322aab748368528c8a35674f2e3fde7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 220488, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "0e8f6cf29bb158e2312b08dc14608160ccf3da6696683700e27acf18806f3e57", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -26176,6 +27202,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -26227,6 +27256,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -26264,7 +27294,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "e876fd6bff77426c2bcffcb869c905b82d8c2ed91f390939a6a84619fac083d1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "dbae25fac979f97bd8352155c5c3455f6ded7fb85ac9d419ca51ac48a7ca9652", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -26286,6 +27316,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -26337,6 +27370,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -26374,7 
+27408,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "7b5a67c43e37b42676ff0f83af8d111dbeff28c73bed0d8db5fc0aaad9e342b4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "595882189d311d4aa949925ba4d9d2460c5fa98d1340ceb86e26aac2b944f911", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -26396,6 +27430,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* 
mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -26447,6 +27484,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -26484,7 +27522,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "48a20560df4495e092fc341d2c311f6a1d1426b2cd07da90bede5bd11f649ae0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "1d1c2624184081e5324051d28b7587760da0673e646d677bd77de72232855010", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ 
gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -26506,6 +27544,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -26557,6 +27598,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -26594,7 +27636,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "1ac07f4b804077539769887308c557067d1cb9daa4e88408ef227ba0643e0836", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "6a52a8f3613dbe282caade001b1ce1c6aa5af822fcb16b07e80f130e175a7509", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -26608,7 +27650,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -26616,6 +27658,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -26624,7 +27669,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -26667,6 +27712,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* 
mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -26675,18 +27721,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -26704,7 +27750,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "41295b9b8f4feef92fec4f0f100cf0b0de14d654d2f40090d0fab96ceb07ad18", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "644358aadca5078509f067e9f57b95b3ff10dd69b72b4fe08d2d17d083a79027", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -26726,6 +27772,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -26777,6 +27826,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -26790,7 +27840,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mValidN */ 256 , /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -26814,7 +27864,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "0b08531adaec9d25db77f00fd3e090255e9f7d46e2b5a6e3b1be2cb28aa04d4c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "c91c954befe6fc7d0a459bea5d79802a86e29d577cc0430bb611f975a1f619dd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -26828,7 +27878,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -26836,6 +27886,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* 
mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -26887,6 +27940,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -26900,13 +27954,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mValidN */ 256 , /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(2) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -26924,11 +27978,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "8727469f146050dcea1fd2e1ac03cf8a13e2e864d1384a05072c4b40f37db95d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "ec7c2a253d27752f7767090709954d92b0e0d60746e56e144ec5010bbd2431d7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -26938,14 +27992,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -26954,15 +28011,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ 
gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 16 +, /* mMmaM */ 256 +, /* mMmaN */ 128 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -26970,10 +28027,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -26985,18 +28042,19 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfBlockSizeC */ 16 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 16 +, /* mTileN */ 128 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -27005,18 +28063,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* 
mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -27034,11 +28092,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "2b72f37e923c549f3a7f9789aaef29d21b4ac0ed2d60272f4b598ede2ca95cc5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "d03f852b0e1c97fc122b92fe07b97dd08c044750046ea9f1a1798fcb14062156", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 
1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -27048,14 +28106,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -27064,15 +28125,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 16 +, /* mMmaM */ 256 +, /* mMmaN */ 128 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -27080,10 +28141,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -27095,18 +28156,19 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfBlockSizeC 
*/ 16 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 16 +, /* mTileN */ 128 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -27115,18 +28177,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -27144,7 +28206,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 193648, 
"bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "5a74150da09a0d526cdd8221f5dc0d8c196f15761051154b0200022e7c9fb361", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "7d962f8e562fcceec40e8bf1c6c8be22512dd90306be9f0192e73b04f8ff73c5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -27158,7 +28220,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -27166,6 +28228,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -27217,6 +28282,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* 
mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -27230,13 +28296,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mValidN */ 256 , /* mValidK */ 256 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -27254,7 +28320,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "3ad91617f6723b393caf579a0f22e2811a517b218fcf5208632cecb76d50d418", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "430def109a944b6574bcba2136dc219890320d96f75d40880acbae93f7b4be7d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -27276,6 +28342,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -27304,9 +28373,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 9 -, /* mNumStagesMma */ 1 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -27323,10 +28392,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -27340,7 +28410,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mValidN */ 256 , /* mValidK */ 256 , 
/* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -27364,7 +28434,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "f90a6cacbfe3014da5019eeb78d0e6a84b9ebfd0c2923ae6c3f9ca27bc28a029", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "776c40e2c1d6a9f0d4e14abbb1148f7e07403ddccfa78667b2f9d2a6a44c9ee0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -27378,7 +28448,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ 
trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -27386,6 +28456,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -27414,9 +28487,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 9 -, /* mNumStagesMma */ 1 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -27433,10 +28506,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -27456,7 +28530,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -27474,7 +28548,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* 
mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "d23a31d10dcb22d7b74d56844ed884da9001ba59d2e8bbdde1230969eecb9868", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "18793283463ba18e691ffc8658d58cdab69ee5d0b88ec325e1f8c5f95482a4ed", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -27488,7 +28562,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -27496,6 +28570,9 @@ 
static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -27524,9 +28601,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 9 -, /* mNumStagesMma */ 1 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -27543,10 +28620,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -27584,7 +28662,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, 
"e678ccbcb93c5edd156afd626a31a586ea2fb27956f1566ff8c5c4d92d65c6e4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "d2f6278d497d434118a8eddcc65aea55f2cf54c157d50dbb43f8e5fe53c62423", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -27606,6 +28684,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -27614,7 +28695,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -27634,9 +28715,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 9 -, /* mNumStagesMma */ 2 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* 
mPatchF2fp */ 0 @@ -27653,10 +28734,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -27665,10 +28747,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 @@ -27694,7 +28776,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "00cf1a66a807a1e74fa7539b297fcb77a44e41b6b7474ddd10819a5ccd9d5d85", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "d7d81a836e92f7b4999981a696c50794c4f247f575f1081d779042bbe3270135", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -27716,6 +28798,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -27724,7 +28809,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -27744,9 +28829,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 9 -, /* mNumStagesMma */ 2 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -27763,10 +28848,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* 
mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -27775,10 +28861,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -27804,7 +28890,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "aca3b9bd44ab0a955bb71a2154546fcdf158fc356feadf0c9e3db676df519b04", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 193408, 
"bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "74afcf62d70577619656adadc45f79e5fbc9d104d8872c595d00e16ace2a33da", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -27826,6 +28912,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -27834,117 +28923,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 16 -, /* mSfBlockSizeB */ 16 -, /* mSfBlockSizeC */ 16 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSparsityA */ 
trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadA */ 0 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfA */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfA */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, 
"ea5533ff75fef077157e38129dade065253062e7c27527413c9c8731cac10832", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -27987,6 +28966,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -27995,18 +28975,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 -, /* mActType */ 
gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -28024,7 +29004,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "65076d360152b3233c304c85422125dee0e9d4e6c82af87a032f22c7d8086466", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "7113ed96c5fb373243e6ca45eacaa5fc4ae4824990912f4671d5252ea7a5b9e0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* 
mClcFastDrain */ 1 @@ -28038,7 +29018,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -28046,6 +29026,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -28054,7 +29037,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -28097,6 +29080,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -28105,10 +29089,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -28116,7 +29100,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN 
*/ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -28134,7 +29118,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "4e6cfd41ed9a5258da4d31c15945ffe9f641be024d8cb0e9f41ae95afc1e54ac", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "5a682227d42094c71128e316628d4562cc9a45a2c76dd5b77f98e26337920e42", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -28148,7 +29132,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , 
/* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -28156,6 +29140,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -28184,9 +29171,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 9 -, /* mNumStagesMma */ 1 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -28203,10 +29190,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -28220,13 +29208,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mValidN */ 256 , /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* 
mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -28244,13 +29232,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 195224, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 768, "5803464557f19440989556285a7193ad58e4d0f3ae128e08e8ad3b8ee9b2b3d5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "9265ebbed68f4ba733007e26b9f96cdcc95d4688907db7cd5568276b866f0b3d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 3 +, /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , 
/* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -28266,6 +29254,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -28274,7 +29265,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1536 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -28291,9 +29282,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 3 +, /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 9 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -28309,14 +29300,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -28325,12 +29317,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* 
mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1536 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -28351,16 +29343,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 195224, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 768, "ebcbfbe792c15d1a65716f73884edea8deedbb2237ad55b6260e337d31900683", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, 
"e1fd8debd3d4df9a7d5fb39e7cfe8cc47c1ce915de4d5a8ce3706709ca760d99", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 3 +, /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -28376,6 +29368,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -28384,7 +29379,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1536 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -28401,9 +29396,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 3 +, /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 9 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -28419,14 +29414,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(1) , /* 
mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -28435,12 +29431,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1536 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(2) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -28461,126 +29457,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 203416, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 768, "1d755d8dc22edbeb4cac3c215c0fa239764d8dcbfd97c411bbc3bb340f03f9cf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 4 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* 
mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 4 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 16 -, /* mSfBlockSizeB */ 16 -, /* mSfBlockSizeC */ 16 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* 
mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 2048 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadA */ 0 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfA */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfA */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203416, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 768, "b5857e6123971e2abd4a43eb000969fad26c32d9ed3c4a97e5ce9685fecd7e68", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "145a2cb97b4d95e9d1eeee9e9facee736ac863ede4e1d2a3e4dda6d4e16f2b4e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 4 +, /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -28588,7 +29474,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -28596,6 +29482,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -28604,7 +29493,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, 
/* mK */ 2048 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -28621,9 +29510,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 4 +, /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 9 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -28639,14 +29528,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -28655,12 +29545,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 2048 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(2) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -28681,14 +29571,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* 
mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "563a484966fa0a61e1663d451c42b6f8ff26ad06eeed4ea429bd2f46d23c4222", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "acaa3400a66e22eb1ef4854d6a3a6e7647dac46b324245bd04e3ad61f805c11c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -28706,6 +29596,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB 
*/ 1 @@ -28721,7 +29614,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -28733,10 +29626,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -28750,13 +29643,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -28765,7 +29659,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 512 @@ -28791,14 +29685,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, 
gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "701d134fa408710e3456c6c2ae962d10966775dde8ca22a70fb70fcba6d7b982", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "02ac5244ac22d85dbc88792498b0f096fb02ac77285e6f3d968c4216a4686754", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -28816,6 +29710,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -28831,7 
+29728,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -28843,10 +29740,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -28860,13 +29757,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -28875,7 +29773,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 512 @@ -28901,14 +29799,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "d0cdfd1fb7b214e16e7736a2fa990a8bf424445877e33ab88340e02775c1707a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "7e4aadcfc3cbcf23bb6b82209ed203f1c180cf442639d48f322c780ba5fc2a04", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -28926,6 +29824,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -28941,7 +29842,7 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -28953,10 +29854,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -28970,13 +29871,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -28985,7 +29887,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 512 @@ -29011,14 +29913,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "720f237fad7a97489b65e57c35e03ed401f87f468e16bb440e5b8e7f9e79f24f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "e28d864a6fa25d144ec703cbbf0319a33fd39d89b1245b5248c988e1bdbfdabb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -29028,7 +29930,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ 
-29036,6 +29938,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -29051,7 +29956,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -29063,7 +29968,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 9 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -29080,13 +29985,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -29095,18 +30001,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ 
{} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -29121,16 +30027,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "d043d45d598685216f8ba1d65a8f12b017fe802e9cf4292c8de449f8fffaf6e2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 195224, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 768, "b4be285b999991d6d4d32699f8deecc16df70ba618369decb532de8c92050c31", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ 
gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 3 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -29146,6 +30052,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -29154,14 +30063,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1536 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -29171,12 +30080,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSplitK */ 3 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -29189,14 +30098,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ 
gemm::SplitK(0) +, /* mSplitK */ gemm::SplitK(2) , /* mTileK */ 512 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -29208,9 +30118,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1536 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -29234,13 +30144,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "0f03b6e2fdf9f246afc442e23ffbc811077616f0f4837957c224ae8c43c252bb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 195224, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 768, "cd9996919ece26df7ac5f916fa237b51c7c6da077fe8e7add89811da6f05a761", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 3 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -29256,6 +30166,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -29264,14 +30177,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1536 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -29281,12 +30194,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSplitK 
*/ 3 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -29299,14 +30212,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(0) +, /* mSplitK */ gemm::SplitK(2) , /* mTileK */ 512 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -29318,9 +30232,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1536 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -29344,13 +30258,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 203512, 
"bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "3788c76d527fff018836a0373b6f64fe037b912f8ffb8de54621e33f6d9cc6b5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 195224, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 768, "38d9f43d6c0be692e2216f7e9597f47a4e0d60670a204c9bc97bf32c7412016e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 3 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -29358,7 +30272,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -29366,6 +30280,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* 
mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -29374,14 +30291,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 1536 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -29391,9 +30308,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSplitK */ 3 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 4 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -29409,7 +30326,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(0) +, /* mSplitK */ gemm::SplitK(2) , /* mTileK */ 512 , /* mTileM */ 128 , /* mTileN */ 16 @@ -29417,6 +30334,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -29425,18 +30343,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* 
mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 1536 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -29454,13 +30372,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "93194a94fd4877926edf979df143597a49505ebbccc62a9cdf147b3911040027", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 203416, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 768, 
"40184a1075a33631816db5dddb072bde690760179f7983623073ff8c57f64d99", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 4 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -29476,6 +30394,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -29484,14 +30405,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 2048 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -29501,9 +30422,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSplitK */ 4 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 4 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -29519,7 +30440,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* 
mSplitK */ gemm::SplitK(0) +, /* mSplitK */ gemm::SplitK(2) , /* mTileK */ 512 , /* mTileM */ 128 , /* mTileN */ 16 @@ -29527,6 +30448,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -29535,12 +30457,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 2048 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -29564,13 +30486,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "8df595bf5ad078ba904e84b00d4077b606d97338bfc4fa29d9840c40854ed256", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203416, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 768, "44ee404a49cb94ecb5636088319773b4d755f12bf6f7c0742f96cb36dcdcd163", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 4 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -29586,6 +30508,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -29594,14 +30519,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 2048 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -29611,9 +30536,9 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSplitK */ 4 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 4 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -29629,7 +30554,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(0) +, /* mSplitK */ gemm::SplitK(2) , /* mTileK */ 512 , /* mTileM */ 128 , /* mTileN */ 16 @@ -29637,6 +30562,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -29645,12 +30571,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 2048 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -29674,13 +30600,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "5f69f8090805174cd4e4afd1bd505a3b28bce2efc4a46b6e130fb58f8ac09351", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203416, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 768, "af582ab7cb04999b863fd857f134f369dd4f36c121d5f92ce485e508e82e8f45", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 4 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -29688,7 +30614,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -29696,6 +30622,9 @@ 
static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -29704,14 +30633,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 2048 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -29721,12 +30650,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSplitK */ 4 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -29739,14 +30668,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(0) +, /* mSplitK */ gemm::SplitK(2) , /* mTileK */ 512 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* 
mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -29755,18 +30685,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 2048 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -29784,7 +30714,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "ee3439523963bbc9682bc4affaf5a05f29288dcdb8c5692c9807f236ab6b8de7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "8b46aa771c2ebc3fd53b089599638b032505e8a8947a9f9570ddb124b407fb6c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -29806,6 +30736,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -29814,7 +30747,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -29834,9 +30767,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -29853,10 +30786,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileK */ 512 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* 
mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -29865,12 +30799,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -29894,117 +30828,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "c9c42a321cba7ed4145e0ec0cd7aae8be915ebd341f0efdb216e47c45d0e66d6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ 
trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 16 -, /* mSfBlockSizeB */ 16 -, /* mSfBlockSizeC */ 16 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* 
mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 1024 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadA */ 0 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfA */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfA */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 172616, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f", 768, "356b1bf529ab39b95a2a60de1aed74a700cbe71346e1896ada0dbf6fd08f7f7f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "e2f7cfddb9cbaecb6614657613f0f2df0f4c5578b7e0658402ac94abd735e25f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -30025,8 +30849,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mFuseUtccpWithUtcmma */ 1 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 @@ -30034,7 +30861,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -30042,21 +30869,21 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 256 -, /* mMmaN */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 8 +, /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 144 -, /* mNumRegsPerThreadNonEpilogueWarp */ 88 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* 
mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -30065,20 +30892,21 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfBlockSizeC */ 16 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 256 +, /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 1 +, /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 , /* mUseShuffledMatrix */ 1 @@ -30088,9 +30916,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -30102,19 +30930,19 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadA */ 0 -, /* mNumRegsPerThreadLoadB */ 32 +, /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfA */ 0 -, /* mNumRegsPerThreadLoadSfB */ 40 +, /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 
, /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 -, /* mNumWarpsLoadSfB */ 4 +, /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(3)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 172616, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 768, "5e708873c4607cfff47cc6ca7a4ba33d6262c8b87e19e174aa98b9a0f8036ba6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "972b0e7cab5462fbf7aad5cf70d7dc23ec1655964b8bbe064944428c1e194d5e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -30128,15 +30956,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ 
trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mFuseUtccpWithUtcmma */ 1 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 @@ -30144,7 +30975,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -30152,21 +30983,21 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 256 -, /* mMmaN */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 8 +, /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 144 -, /* mNumRegsPerThreadNonEpilogueWarp */ 88 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -30175,20 +31006,21 @@ 
static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfBlockSizeC */ 16 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 256 +, /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 1 +, /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 , /* mUseShuffledMatrix */ 1 @@ -30198,7 +31030,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -30206,25 +31038,25 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadA */ 0 -, /* mNumRegsPerThreadLoadB */ 32 +, /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfA */ 0 -, /* mNumRegsPerThreadLoadSfB */ 40 +, /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 -, /* mNumWarpsLoadSfB */ 4 +, /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ 
{batchedGemm::RouteImpl(3)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin_len, 172616, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f", 768, "645be6164d33eda1c00b15120eee3371fc68af37b3b7a2250dab1c4935c71077", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "c2259e360d1651f581002ba46d6e76a0b751ac0a3b38deab925a58d6bbabb122", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -30238,15 +31070,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit 
*/ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mFuseUtccpWithUtcmma */ 1 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 @@ -30254,7 +31089,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -30262,21 +31097,21 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 256 -, /* mMmaN */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 8 +, /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 144 -, /* mNumRegsPerThreadNonEpilogueWarp */ 88 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -30285,20 +31120,21 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfBlockSizeC */ 16 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* 
mSfLayoutC */ trtllm::gen::SfLayout(1) , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 256 +, /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 1 +, /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 , /* mUseShuffledMatrix */ 1 @@ -30308,9 +31144,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(2) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -30322,23 +31158,23 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadA */ 0 -, /* mNumRegsPerThreadLoadB */ 32 +, /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfA */ 0 -, /* mNumRegsPerThreadLoadSfB */ 40 +, /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 -, /* mNumWarpsLoadSfB */ 4 +, /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(3)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "7df56cd9ceba03381de7337a973bdec858c7edd23ce3ba897d001efc1d9bc47f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "4176e58ae58240ab99fc026cefdbc61e5678fadb6ff3df42d68345aa1e848039", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -30355,7 +31191,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -30364,15 +31203,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* 
mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -30383,10 +31222,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -30400,13 +31239,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -30418,7 +31258,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 @@ -30441,14 +31281,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ 
{batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "445e22fc5aed3e74f2afd2c913aac42809360d7d272269452aa0b72c4a8a0338", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "4822e26ba37df0e6e9d9b37c279d291face06f92575aa1dc89be984a03ae52fd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -30465,7 +31305,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* 
mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -30474,15 +31317,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -30493,10 +31336,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -30510,13 +31353,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -30528,7 +31372,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK 
*/ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -30551,14 +31395,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "2bddd6538f9a80ff215903a220015e27cc154e3017aa71dfa6e2727111badbd8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "7e84404342220ee4f79ff40b778cffa79091baf9d5ae0475108813d2df6e0803", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType 
*/ gemm::CtaSwizzleType(0) @@ -30575,7 +31419,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -30584,15 +31431,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -30603,10 +31450,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -30620,13 +31467,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* 
mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -30638,7 +31486,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -30661,14 +31509,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "0684c13b7797a3ff3b43e75152fd7087526bff87a686b21db68f836ff20532a7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, 
"e11bef5c553f5b1a3f773f78eadadce63744e269ad4e577e3e90094f704528ca", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -30678,14 +31526,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -30694,15 +31545,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -30713,7 +31564,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* 
mNumStagesMmaAcrossWorkTile */ 1 @@ -30730,13 +31581,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 32 +, /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -30748,7 +31600,121 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 203512, 
"bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "feda64d64314aeeaf5a65573214c88e10292e65af8250c059f4c6e9286ca2f9b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 2 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 256 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* 
mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 @@ -30771,14 +31737,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214912, 
"bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "6c6543405ddd91a63005332a83c6170f52d300879c5465aeb7b136f0944528e2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "eb675c730c86a205aed8baf11717eb87ab18dca100f44fff0bac729e4eef923f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -30795,7 +31761,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -30804,15 +31773,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 
256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -30823,10 +31792,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -30840,13 +31809,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -30855,10 +31825,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -30881,14 +31851,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ 
{batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "10a5cd60a3c7b3e42cd6b108fb8b3a60859af9bc19f36b1897edfb3b74325a6b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "c04f048b76611b2b74813d8f611e9723a726408a736ae1df0524fa0655ab2ca0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -30905,117 +31875,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* 
mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 16 -, /* mSfBlockSizeB */ 16 -, /* mSfBlockSizeC */ 16 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, 
/* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadA */ 0 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfA */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfA */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "5886741e6a30f896a3965b479ecaa79c3332e65f360e7031a4896e4186bc57b0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, 
/* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -31024,15 +31887,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -31043,7 +31906,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -31060,13 +31923,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 32 +, /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -31078,15 +31942,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) 
+, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -31101,14 +31965,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "02445c49e827cac872950069e0ffb4d6eaa83b950230220da8dbc1d95557e863", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, 
"ce9f9cc3dc51379712065de08e5c54a0bceecc17e5c989736c0cf6a71213a512", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -31118,14 +31982,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -31134,15 +32001,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -31153,7 +32020,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* 
mNumStagesMmaAcrossWorkTile */ 2 @@ -31170,13 +32037,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 32 +, /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -31188,7 +32056,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -31196,7 +32064,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -31211,14 +32079,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 215152, 
"bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "3d232bbcc8c4f1c6497d77a81d93991864fa9079b993a8b2653c16a91027ba11", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "1f7cee65ffe3f8f5f54c69040b09c3fbc3df323be7c8436dd9161f8fa520a790", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -31228,14 +32096,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -31244,15 
+32115,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -31263,10 +32134,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -31280,13 +32151,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -31298,15 +32170,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , 
/* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -31321,14 +32193,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "fc0a94f1ed97f450f07f429c0ecadccd8eeb9683aeba735fe8c59941f018050b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "7dfa57cf56d9412230731b8914c86a4709a47cef116d478064f7547427992697", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) 
, /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -31345,7 +32217,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -31354,15 +32229,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -31373,7 +32248,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -31390,13 +32265,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 32 +, /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* 
mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -31408,9 +32284,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -31431,14 +32307,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "887257c3011ac976d8ad3d9f628389054f379b23e73dcc26b31d8ad721015662", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203272, 
"bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "7eb6510220f41643668c8efef6b9c5e7b67004da77db4d9256ba9426a2137162", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -31448,14 +32324,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -31464,15 +32343,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -31483,7 +32362,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* 
mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -31500,13 +32379,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 32 +, /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -31518,7 +32398,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -31526,7 +32406,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -31541,14 +32421,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "04a9666a04a2a4b8d2d171c9ae6ae88992d100de13223811deb29df20d82ae75", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "0e4ca48ddee507afd0058e8e8bd831dc8dc265933278a6b99dbbebccff10d13c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -31558,14 +32438,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, 
/* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -31574,15 +32457,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -31593,7 +32476,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -31610,13 +32493,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 32 +, /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -31628,7 +32512,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -31651,16 +32535,16 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 175672, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 896, "34b34c7e43159332d42a5226a1230bd1288b84974f02dcfe2bacd73d854b30b3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 172616, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f", 768, "0895a763d11e75bea4d2ccbff9a8e8f6302ab45b6251ca4a7af4645f05ff50e8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 +, /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc 
*/ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -31675,8 +32559,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 1 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 @@ -31684,29 +32571,29 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 256 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 4 +, /* mNumEpilogueWarps */ 8 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 2 +, /* mNumRegsPerThreadEpilogueWarp */ 144 +, /* mNumRegsPerThreadNonEpilogueWarp */ 88 +, /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -31715,20 +32602,21 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfBlockSizeC */ 16 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ 
trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 32 +, /* mTileN */ 256 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 +, /* mUseMaxTmemOverlap */ 1 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 , /* mUseShuffledMatrix */ 1 @@ -31738,7 +32626,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 @@ -31752,25 +32640,25 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadA */ 0 -, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadB */ 32 , /* mNumRegsPerThreadLoadSfA */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 40 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 -, /* mNumWarpsLoadSfB */ 0 +, /* mNumWarpsLoadSfB */ 4 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(3)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 175672, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "9082649b567a8bc5d7e45e1a749124f3e2f7721f90c455a611277f5ea1d3cdb3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 172616, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 768, "53afae170050089434427fbce50c144b103bd901a00c8a7e35291c4b214343ce", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 +, /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -31778,15 +32666,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* 
mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 1 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 @@ -31794,29 +32685,29 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 256 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 4 +, /* mNumEpilogueWarps */ 8 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 2 +, /* mNumRegsPerThreadEpilogueWarp */ 144 +, /* mNumRegsPerThreadNonEpilogueWarp */ 88 +, /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -31825,20 +32716,21 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfBlockSizeC */ 16 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) , /* 
mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 32 +, /* mTileN */ 256 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 +, /* mUseMaxTmemOverlap */ 1 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 , /* mUseShuffledMatrix */ 1 @@ -31848,33 +32740,33 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 256 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(2) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadA */ 0 -, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadB */ 32 , /* mNumRegsPerThreadLoadSfA */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 40 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 -, /* mNumWarpsLoadSfB */ 0 +, /* mNumWarpsLoadSfB */ 4 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(3)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 896, "a2f064856d61dfc4f8e31e8a2c221b39e2c940367d08844a1d8b166f1fb454e9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin_len, 172616, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f", 768, "2944fce32fc866c4fcd16255497199002bbee5da15b5683ecf095738c339b6a5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -31888,15 +32780,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 
256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 1 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 @@ -31904,7 +32799,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -31912,21 +32807,21 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 256 -, /* mMmaN */ 32 +, /* mMmaN */ 256 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 4 +, /* mNumEpilogueWarps */ 8 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 144 +, /* mNumRegsPerThreadNonEpilogueWarp */ 88 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -31935,20 +32830,21 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfBlockSizeC */ 16 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* 
mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 32 +, /* mTileN */ 256 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 +, /* mUseMaxTmemOverlap */ 1 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 , /* mUseShuffledMatrix */ 1 @@ -31958,33 +32854,33 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadA */ 0 -, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadB */ 32 , /* mNumRegsPerThreadLoadSfA */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 40 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 -, /* mNumWarpsLoadSfB */ 0 +, /* mNumWarpsLoadSfB */ 4 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(3)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 896, "60ee6d0bd039df49844844f5ffbed2951bd8aba456a875991951368c898cd651", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_silu_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_silu_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin_len, 172616, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_silu_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f", 768, "acff009c3662793b3c92e9bc4c4ca9a654150331d010fa6fdec28e396be749ef", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -31998,15 +32894,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 
1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 1 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 @@ -32014,7 +32913,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -32022,21 +32921,21 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 256 -, /* mMmaN */ 32 +, /* mMmaN */ 256 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 4 +, /* mNumEpilogueWarps */ 8 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 144 +, /* mNumRegsPerThreadNonEpilogueWarp */ 88 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -32045,20 +32944,21 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfBlockSizeC */ 16 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 32 +, /* mTileN */ 256 , /* mTileScheduler */ gemm::TileScheduler(1) , /* 
mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 +, /* mUseMaxTmemOverlap */ 1 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 , /* mUseShuffledMatrix */ 1 @@ -32068,37 +32968,37 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadA */ 0 -, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadB */ 32 , /* mNumRegsPerThreadLoadSfA */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 40 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 -, /* mNumWarpsLoadSfB */ 0 +, /* mNumWarpsLoadSfB */ 4 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(3)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 216824, 
"bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "dca4b39f00c5e17f8e150f884fb42527d6ae96214efb269f30437d9b581dc010", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "219b8f837bb5ec2d7560019a5bb2c176636bbde1cea5449160ba3afe3d976924", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -32108,7 +33008,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -32116,6 +33016,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA 
*/ 0 , /* mGridTriggerSecondaryB */ 1 @@ -32124,14 +33027,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -32143,7 +33046,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 9 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -32160,13 +33063,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -32178,15 +33082,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* 
mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -32201,14 +33105,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 768, "f6057febca6ef33a96802a148e2f812ca24e95963ab9b18e3658717302807208", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "0fa3cc177e015687be27c13845ec9eda43b3182d9854ffe523ce01c2284b4181", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ 
-32226,6 +33130,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -32234,14 +33141,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -32253,10 +33160,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -32270,13 +33177,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -32288,9 +33196,9 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -32311,14 +33219,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 768, "36096626550738537780d40d5a447476eb1b8a4546ab916dca41ff6b912b6255", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "6d13a81aec4f8f4c3067110ff6cddc42d569bcaa2c1d1657d091c7ba178d02d9", "", nullptr, 
nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -32328,7 +33236,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -32336,6 +33244,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -32344,14 +33255,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -32363,10 +33274,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 
, /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -32380,13 +33291,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -32398,7 +33310,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -32406,7 +33318,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -32421,14 +33333,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 768, "4959925c55ded58259339af5a26f3a370b909173a769298cef72f106838e762c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "d86c2c20b1ebc8f2d34a0970fe83031fe4c621b271eb17cbc06367fbe63d449a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -32438,7 +33350,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -32446,6 +33358,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* 
mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -32454,14 +33369,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -32473,10 +33388,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -32490,13 +33405,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -32508,7 +33424,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 
, /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -32531,14 +33447,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 896, "f00d48938b5982ed541c47333c9a85c164f6ad51a16e04be30d0d0af6c383120", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "ec55d7175b30dbfc4fd572ec8ed34a6c33d9bf558d0c22995354125dd8f2a4b1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ 
gemm::CtaSwizzleType(0) @@ -32556,6 +33472,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -32564,14 +33483,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -32583,10 +33502,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -32600,13 +33519,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -32615,10 +33535,10 @@ 
static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 @@ -32641,14 +33561,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 896, "6fd57fa070371c6e843f38d501ddddb631f5f9a6a7b68984ac82c6be5dce8d86", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, 
"187fd126724246405e3ed91d3b7153578f9ae25009d8a1b38e6b3d9e3f52263d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -32666,6 +33586,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -32674,14 +33597,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -32693,10 +33616,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -32710,13 +33633,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 -, 
/* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -32725,10 +33649,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -32751,14 +33675,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "81d38f2f3afc955c18082ef9458b504f2c8f7cb9a174be691f8a13c69736ec51", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "045d6aa32c1f04c1f51652b87664ba516d4cedcd54b2ac819b07d56b6f349cbb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -32776,6 +33700,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -32784,14 +33711,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -32803,10 +33730,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 
, /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -32820,13 +33747,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -32835,10 +33763,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -32861,14 +33789,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 768, 
"fe2d8d92492a06da698a1237163bb63c5d2d7f13b60e03bc53d57a925faac7d8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "a2104851aff21b5f9ecd95bb38ac3df8fe234320063ca3c629943b9a87f5f9ff", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -32878,7 +33806,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -32886,6 +33814,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -32894,14 +33825,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 
, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -32913,7 +33844,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 9 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -32930,13 +33861,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -32945,18 +33877,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 256 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -32971,14 +33903,14 @@ static 
const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 768, "9ef8e1b2b6e665b5bb04ea71ef5719885c32b25c85cccfb94f249754d93bccbb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "86ed86a3dbedc37e898e14254352b61815566d632866be67eb6e9c8a248b7e73", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -32996,6 +33928,9 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -33004,14 +33939,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -33023,10 +33958,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -33040,13 +33975,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -33058,9 +33994,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* 
mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -33081,14 +34017,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 768, "5d55899b488f30a2f2383fa157057205bf6cbeb26e8d705c92ecc702ec46e9f3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "574ac6bcac81b2b03cd658deaefcf5bb5f02099a707dc08fd671b60f35217ba4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ 
gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -33098,7 +34034,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -33106,6 +34042,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -33114,14 +34053,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -33133,10 +34072,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* 
mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -33150,13 +34089,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -33168,7 +34108,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -33176,7 +34116,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -33191,14 +34131,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "d7845ba0e9479c3d8dd1ef0621df6ede366c1a8e7c00a01d8517225f8e1acd3d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "7f9166006cce5a422f87a5f1e4f8ea924bccadf8dea14fbc22ec506283ebbb56", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -33208,14 +34148,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 
32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -33224,15 +34167,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaM */ 128 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -33240,10 +34183,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 +, /* mNumStages */ 9 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -33262,11 +34205,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 64 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -33275,18 +34219,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 
-, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -33304,11 +34248,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "da13cb1ea87c963709ce9d282a7a2ff9e69641f5554bf6cfe78622ed900b89f7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "1272cfa7ed90576061c0e7d6cd38f7d9a49a501bb9e9a1692698d81abededbb2", "", nullptr, nullptr, nullptr, 0, 
{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -33318,14 +34262,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -33334,15 +34281,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaM */ 128 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -33350,10 +34297,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 +, /* mNumStages */ 9 , /* mNumStagesMma */ 2 , 
/* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -33372,11 +34319,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 64 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -33385,10 +34333,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -33396,7 +34344,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -33414,11 +34362,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 151384, 
"bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "6691468120670e20e4617c444989aba4cbfc244b81cd14656c462532a79cae3d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "b5aee42a14b2172eb4c23001bf8f3f34077ec9b85d490dcb7d8a7e3f6ed20b62", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -33428,14 +34376,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ 
-33444,15 +34395,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaM */ 128 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -33460,13 +34411,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -33482,11 +34433,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -33495,18 +34447,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* 
mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(2) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -33524,11 +34476,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "32330d23dc770871dd08be727d555736d9dd3fccce7f55c0818353f4e3ea5e5b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "8a7c3a8bc49fe8a6642e7c18cf458c567e753051e03c06aa18080b774415c005", "", nullptr, nullptr, nullptr, 0, { /* 
mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -33545,7 +34497,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -33561,8 +34516,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaM */ 128 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -33570,13 +34525,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -33592,11 +34547,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* 
mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -33610,7 +34566,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mValidN */ 256 , /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -33634,11 +34590,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "961b4c0931a18742cddb17f25d90bb401c4064ca83df1493a9e784da2cb3371c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, 
"1517b3f2c06e4d709d834aed3fa608dc23deca0b7a61194577365f9d63223352", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -33648,14 +34604,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -33671,8 +34630,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaM */ 128 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -33680,13 +34639,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* 
mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -33702,11 +34661,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -33726,7 +34686,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -33744,11 +34704,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "06be952c3539e25ea13c8a746e40d64e8bfb181c392a512a5abf871c6195bc7b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "2b206069d79dcbc4df528be693d9782709b79796cd5f703029b5a8247b571d44", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -33758,14 +34718,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -33781,8 +34744,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaM */ 128 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -33790,13 +34753,13 @@ 
static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -33812,11 +34775,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -33830,7 +34794,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mValidN */ 256 , /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(2) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -33854,13 +34818,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 896, "356a82d5abfee0c90a5d660feff497e8675542ed6bfcc15908405a3a996d70f1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 175672, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 896, "616b3943d20160b6edb4c9bd8830cb8b41cf71b0254f93337ab53f816cc06772", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 2 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -33876,6 +34840,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -33884,15 +34851,15 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaM */ 128 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -33900,10 +34867,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 -, /* mNumSlicesForSplitK */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 2 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 3 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -33919,14 +34886,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(0) +, /* mSplitK */ gemm::SplitK(2) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 64 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -33938,7 +34906,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 @@ -33964,13 +34932,13 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 896, "22f9918d86e166fb349e1876bda7dd15b691cd6495a12662c1d702790ddafdd1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 175672, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "b7d58686994de72eee9584e261afd79b6fed6769f839f1ad82e3869739532b80", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 2 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -33978,7 +34946,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ 
trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -33986,6 +34954,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -33994,15 +34965,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaM */ 128 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -34010,10 +34981,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 -, /* mNumSlicesForSplitK */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 2 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 3 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -34029,14 +35000,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(0) +, /* mSplitK 
*/ gemm::SplitK(2) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 64 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -34048,15 +35020,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -34074,13 +35046,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "a62bb1623b2d3120c1e06c3c909ea9153c699bee445a92e8bd27c44f0a0b6e21", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 175672, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "f6c5ddda331b99621220bb18ec0e7d2bc5dc1a4e873f0bf7f46a470a99ea08fb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 2 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -34088,7 +35060,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -34096,6 +35068,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -34104,15 +35079,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 
1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaM */ 128 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -34120,10 +35095,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 -, /* mNumSlicesForSplitK */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 2 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 3 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -34139,14 +35114,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(0) +, /* mSplitK */ gemm::SplitK(2) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 64 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -34158,7 +35134,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 @@ -34184,7 +35160,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, 
gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 896, "0b1a6cbaeb5be440d583f0d124b6623bc3d8fb51c9a5bdd2aac844761ece958d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 896, "66bb64327c0c319f7eaee2cc5e8659be20b3972903755b401560d3f5abea999b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -34206,6 +35182,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -34214,7 +35193,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit 
*/ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -34222,7 +35201,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -34230,10 +35209,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 5 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -34252,11 +35231,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 64 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -34265,10 +35245,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 @@ -34294,7 +35274,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* 
mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 896, "7c84b57014024bba8edb326d04a298be4989de3c1794d419f9def56c16c1b4ca", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 896, "262a9ab729165e696cb303487354a10319038fcfb8f72389ddf7236272e51126", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -34316,6 +35296,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -34324,7 +35307,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB 
*/ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -34332,7 +35315,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -34340,10 +35323,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 5 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -34362,11 +35345,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 64 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -34375,10 +35359,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -34404,7 +35388,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ 
{batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "1fea6456b6265f121a62fa9c3a55e70d2476e95ec0502a812dcc1de0447deadc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "01413a3988941fb06742181419b7a92f28d82da6f9e63e17d11b161c2442cd0c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -34426,6 +35410,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -34434,7 +35421,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { 
, /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -34442,7 +35429,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -34450,10 +35437,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 5 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -34472,11 +35459,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 64 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -34485,12 +35473,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(2) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -34514,11 
+35502,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "1279a627948ce7292ca4e1bb78edef63b388a524e72d97ddb5f1b3e5b558ff2e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "60aede6214b20ba266822d4f49341e78c4210df382ba8d7beedfdb24b797d518", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -34528,14 +35516,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ 
trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -34544,15 +35535,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -34563,7 +35554,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -34580,13 +35571,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 
@@ -34598,7 +35590,121 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 768, "700f61041637bef66d9ef03bad48129ef93d645a766d415d3e2fd1a2ccb54e2f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 2 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ 
trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 256 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ 
gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 @@ -34621,14 +35727,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "c8ff8fae8c41ca5b68e8fc13d611134c4aa8460c5eec1fdad0ca93dd517105df", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 216584, 
"bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 768, "923b8540a0ee23c930c310a1d77188c8307c3d43931eb9069069c7c4e77c59eb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -34645,7 +35751,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -34654,15 +35763,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -34673,10 +35782,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ 
-34690,13 +35799,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -34708,7 +35818,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -34731,14 +35841,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "6ee2de94bb9f050044800cbf29a7d4a97380762d5d8972315eba04efecc1b9c4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 768, "e6a6393f8b603f5ff7c562f8009781923f5192aa2737af2acfb8fca5a5d42719", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -34755,117 +35865,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, 
/* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 16 -, /* mSfBlockSizeB */ 16 -, /* mSfBlockSizeC */ 16 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadA */ 0 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfA */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfA */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "397bbce8d0d69c73e416953ddc88c0adae0160acca87df7e17ea86fa1d754a62", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -34874,15 +35877,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps 
*/ 4 @@ -34893,7 +35896,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -34910,13 +35913,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -34928,15 +35932,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -34951,14 +35955,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "1fa4ab6dcfe6511262b6413c12d44fb347bff22c6a9769737ad98cb5aec7b600", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 768, "06438f710c38d0ad849f8730e23414de709b39249009aaeec87ff49a85232144", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -34968,14 +35972,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* 
mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -34984,15 +35991,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -35003,7 +36010,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -35020,13 +36027,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -35038,7 +36046,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize 
*/ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -35046,7 +36054,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -35061,14 +36069,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "0be38a345e02af78b6dae6751934a619a6435e8b6285c909975aeb894133e84e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 896, 
"afe41d5236e744ed50d017d9dbceed57dbd1019ee52d2ba0e4f09eeab5edce5f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -35078,14 +36086,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -35094,15 +36105,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -35113,10 +36124,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 , /* 
mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -35130,13 +36141,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -35145,18 +36157,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 1024 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -35171,14 +36183,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "0132d288d39de8583523ccf8cd6f41bf566bf2296de686db988baf0945b63dc9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 896, "7f9c6e7a2ae95c39d68a89c3b9c6f224a1ec9a241b12546e8bd646634b40c786", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -35195,7 +36207,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , 
/* mGridTriggerSecondaryB */ 1 @@ -35204,15 +36219,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -35223,7 +36238,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -35240,13 +36255,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -35258,9 +36274,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -35281,14 +36297,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* 
mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "a6b8f6fe6b225c51e4486af480b8c00e91e3e2cab65cb8f6ad4568cbea52cde3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "dbd271f4b2faef9f3b45ad71dcf0693acbded3e5273f716cec4de7366b954962", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -35298,14 +36314,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ 
trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -35314,15 +36333,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -35333,7 +36352,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -35350,13 +36369,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* 
mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -35368,7 +36388,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -35376,7 +36396,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -35391,14 +36411,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "f4d57caceb52eb44a0ca673c93ec2a0237e78ea8d96ba81e5a14e12c1b7e4012", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "ae10c83eccbb552eaae176ec0ef7a1c25dfe9ec2bfd147164f2a971bdfef2090", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -35408,14 +36428,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -35424,15 +36447,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN 
*/ 256 , /* mNumEpilogueWarps */ 4 @@ -35443,7 +36466,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -35460,13 +36483,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -35478,7 +36502,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -35501,14 +36525,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 183168, 
"bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "753252d5def1582dca4d68a4af2d13bce2ce267c0ac833486d0bdc0835ba2b9c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 768, "25a10a9ee5ef81ac7334211ae7dacba95ff8f2e3f48f3a0a89200b520ee12b43", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -35525,7 +36549,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -35534,15 +36561,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* 
mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -35553,7 +36580,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -35570,13 +36597,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -35588,7 +36616,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 @@ -35611,14 +36639,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "f3092af1ebedd1adf9b42852d24d70e225a401ce4b4a14f0335f15ee23bef8da", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 768, "ef7280bc148704a480100dc3f5729395a5d00be44ab4b970e909972b8e406c57", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -35635,7 +36663,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -35644,15 +36675,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* 
mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -35663,7 +36694,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -35680,13 +36711,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -35698,7 +36730,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -35721,14 +36753,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "cf819a3db90b2ce7f430c018128a28c00f6472eb161969dc6f7bd5502858b76d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 768, "e7baeadf83baa95b01038f3314a5b10f1f26af3a766896ed26fb025dc38f3d57", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -35745,7 +36777,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ 
-35754,15 +36789,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -35773,7 +36808,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -35790,13 +36825,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -35808,7 +36844,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -35831,14 +36867,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* 
mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len, 162208, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f", 512, "00b4d0839dad846db403a94108e1e153740d7ced590548c75a3ff4ce65428259", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 768, "822d2a5ce71af977115d4f4bd4202cd28b2670db1dcd102393e27bc08baeadcb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -35848,14 +36884,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 0 +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 , /* 
mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -35864,15 +36903,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -35883,7 +36922,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 5 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -35894,7 +36933,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfBlockSizeB */ 16 , /* mSfBlockSizeC */ 16 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 @@ -35902,11 +36941,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* 
mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -35915,10 +36955,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -35928,27 +36968,27 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 1 +, /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 2 +, /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadA */ 0 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfA */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 0 +, /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 , /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, 
"b7cc298b36951f6d1d20d1a6e7839e11945b6de2256f419c8aa6f7cb92003971", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "fef23f8e3d9fe50d35c3052cf030d05748b4c3b140b2609b0e112fc711c1a7b2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -35965,7 +37005,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -35974,15 +37017,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , 
/* mNumEpilogueWarps */ 4 @@ -35990,10 +37033,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -36010,13 +37053,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 64 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -36028,7 +37072,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 @@ -36051,14 +37095,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "900059ffdaa34c18b634a20e2424c2b19a0664687a72e3c43268c7da29e2a7a4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "fa8fd5c4dbe59e8615e4e70482e8edbbf0d3f8a1de24f889dbda5fab34693407", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -36075,7 +37119,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -36084,15 +37131,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit 
*/ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -36100,10 +37147,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -36120,13 +37167,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 64 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -36138,7 +37186,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -36161,14 +37209,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, 
/* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "393bca7f8fa59bcf1a9c21014c2d78972af57997aba3dbff1ed1bda9cfe8a967", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "173ab52cec3d313199534f7c7176d6853659a632500a38fec9fc38762ccef496", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -36185,7 +37233,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* 
mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -36194,15 +37245,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -36210,10 +37261,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -36230,13 +37281,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 64 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -36248,9 +37300,123 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) 
+, /* mActType */ gemmGatedAct::ActType(2) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "7e4390b97295ab91bb5b71e33f3ecc10432a1f2a0057ec74f6330b57b21a4ca2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 2 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ 
gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 256 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* 
mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -36271,14 +37437,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "2c2faf119a3f0af582712e1dbc97711289f76d5ea506d538531a367631f4a776", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, 
"53da50f6b70dee4eaf7965027dd8e1c2a7601cfc6cf71368dfa7c4977ef33ccb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -36295,7 +37461,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -36311,8 +37480,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -36320,13 +37489,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -36340,13 +37509,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 
128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -36355,7 +37525,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 512 @@ -36381,14 +37551,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "6dd90508e3094ab31e5cff83dd120a55d08ef5348465d734818290d71d797e8f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "a57dc3a61da9438ad5da55245c0a454a7b83557a8e38ca30ae8b90f30a955da3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -36405,7 +37575,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -36421,8 +37594,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -36430,13 +37603,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile 
*/ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -36450,13 +37623,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -36465,7 +37639,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 512 @@ -36491,14 +37665,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 512, "0249a035b5d34084e0ff2648f0a737cc62559458ef2b2b0f32b7b678104bc096", "", nullptr, nullptr, nullptr, 0, 
{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "edb0f39df63835429697e6af5fe1393d7e9d12ad49b873a0cd70bd87375630e1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -36515,7 +37689,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -36531,8 +37708,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -36540,13 +37717,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , 
/* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -36560,13 +37737,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -36575,12 +37753,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -36601,16 +37779,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 209656, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "e4c1947e0c1ff623224d9f94a31bc7aee5c4d2d845e2069b50a4c58650c83941", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "83607c3044321bf831500ea393803f1a32bad4b2406e1b1f4d09cc83277f48d0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 +, /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -36618,14 +37796,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* 
mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -36634,15 +37815,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -36650,10 +37831,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 2 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -36669,14 +37850,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 64 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* 
mUsePerTokenSfA */ 0 @@ -36685,18 +37867,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -36711,16 +37893,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 209656, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "5d5be167b587320edc504bb67de7181317fd5ad628c3a674ed660fcaea031ac5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 896, "3ad9c82758a5acfd29580061a9d6dff9b170c0fb486cb34af701a4e38800a727", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 +, /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -36728,14 +37910,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -36744,15 +37929,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ 
trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -36760,10 +37945,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 2 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 4 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -36779,14 +37964,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(2) +, /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 64 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -36798,15 +37984,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(2) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -36824,13 +38010,13 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 213752, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "36d6390f160850e1fee94de15e4ff9a1f6db198f37f00d28fdb28835f670a548", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 896, "399cba5e92d365ec8cbd55ddc38c42350cc03808017071648a15bb7a758ed8c9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 3 +, /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -36845,7 +38031,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits 
*/ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -36854,15 +38043,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1536 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -36870,10 +38059,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 3 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 4 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -36889,14 +38078,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(2) +, /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 64 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -36908,9 +38098,9 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1536 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -36934,13 +38124,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213752, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "47e96f8f743648afcd645487c00bdbc7cce8f94308b69f4a185f3dee7617e012", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "dae1a7c8a29ad0da1e32f3955fddeedc260ac17c71dfe568e57045fd73f7f085", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* 
mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 3 +, /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -36955,7 +38145,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -36964,15 +38157,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1536 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -36980,10 +38173,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 3 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 4 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -36999,14 +38192,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(2) 
+, /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 64 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -37018,7 +38212,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1536 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 @@ -37044,13 +38238,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 217848, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "fa05f522814e760e8578f48d78b251dff0819cf733a4b443af426753ffa1b5cb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 196248, 
"bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "0baac1d015679e6499be16cd7bb7e1e0eddc01ec50659c143bcb7300c729760a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 4 +, /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -37058,14 +38252,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -37074,15 +38271,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -37090,10 +38287,10 @@ static 
const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 4 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 4 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -37109,14 +38306,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(2) +, /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 64 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -37128,15 +38326,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 2048 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -37154,13 +38352,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 217848, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "18fbd102638dc6dd8b70f2e799edd16683e1fdfa6f08f7b59ffc10e120bd5f7e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 896, "fcd028ced43e1f9c138f50b5b3be620a04778e381bbc2f80f1a4a3171ff2a7fa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 4 +, /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -37168,14 +38366,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ 
gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -37184,15 +38385,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -37200,10 +38401,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 4 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 4 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -37219,14 +38420,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(2) +, /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 64 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* 
mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -37235,18 +38437,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 2048 +, /* mValidK */ 1024 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(2) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -37264,11 +38466,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len, 162208, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f", 512, "0d11ea83d184b85758152df122fd685436fac066200012b7e51a15c7be50ae56", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 896, "50a798f26e98d26c9c2ffc30e5641f2f41bc957b695a77fe9fa44faaa924db93", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -37279,13 +38481,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 0 +, /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -37301,8 +38506,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -37310,13 +38515,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* 
mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 4 -, /* mNumStagesMma */ 1 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -37324,7 +38529,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfBlockSizeB */ 16 , /* mSfBlockSizeC */ 16 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 @@ -37332,11 +38537,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -37356,25 +38562,253 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* 
mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "2f9761756913a37c3d811777b3bc9e2ff93655c28cac997b83bb667018690b60", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 2 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK 
*/ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 256 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(2) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 1 +, /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 2 +, /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadA */ 0 , /* 
mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfA */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 0 +, /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 , /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "6e31aa64887999689664132c637495efefac50392e32a72956352b7547369be0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 2 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* 
mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 256 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* 
mActType */ gemmGatedAct::ActType(2) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "dc6ac074a30d89ca532d978c108faeab68578a5df50a7cd17d06676fbb9a005a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 512, 
"645177775c9994dacacbc15dd9fb598262c55782106182f151093667eee604d3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -37396,6 +38830,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -37404,7 +38841,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -37423,7 +38860,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 9 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -37440,13 +38877,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -37455,10 +38893,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* 
mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 @@ -37481,10 +38919,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "acf03f8fb6440c402378a65ffca4cbe764b91fdc5b1b4aad6271eea9e7c04008", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "9c2d7ff913b57de65bb0ba05dc8fc5b37b1366bfadf54680bcca861f530d401b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* 
mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -37506,6 +38944,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -37514,7 +38955,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -37533,7 +38974,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 9 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -37550,13 +38991,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -37565,10 +39007,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ 
gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -37591,10 +39033,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "e570048da60df3d2179f2f437b9b1e7a7e8df1b75fe27e1337ffd13f7a2da955", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "c7197e22b59815db21c6e30723db7917e8b6f0c01ca8fdf21be37f46c3e0398f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -37616,6 +39058,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , 
/* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -37624,7 +39069,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -37643,7 +39088,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 9 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -37660,13 +39105,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -37675,10 +39121,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -37701,10 +39147,124 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , 
/* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "abc2afc519706a7f0e7e56bca0ca950c1d342a72a55ced10ac393bbbf188a22b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 
256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* 
mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "ef25f9ba3a48ff637c201fc737c7ec34fbdb16096914e830a45260d8fedce591", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "237a36b5965e322879d05c2f17716c2d7f627ef7373c0625d190e63e7a614f85", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -37726,6 +39286,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* 
mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -37734,7 +39297,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -37753,7 +39316,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 9 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -37770,13 +39333,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -37785,10 +39349,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 @@ -37811,10 +39375,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , 
/* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "2b8bdc1546ec47ffa676bc6c5f5e3cac5d5102c2e9b4b65a761dc64460c142cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "ad669e907866158b333ff52bf694b922088d0f1378636b3997641d7a945bab2b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -37836,6 +39400,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 
0 , /* mGridTriggerSecondaryB */ 1 @@ -37844,7 +39411,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -37863,7 +39430,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 9 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -37880,13 +39447,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -37895,10 +39463,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -37921,10 +39489,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, 
gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 512, "4d6b0827257a95a38d389b2b43b7240ee9f61a2615c0f28267c7335bf5fd4f3b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "a2e8adb2cd75e41383a3efecc479a3c21c340cb1d6d164d473bb881d6b3a18c7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -37946,6 +39514,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -37954,7 +39525,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK 
*/ 1024 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -37973,7 +39544,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 9 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -37990,13 +39561,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -38005,10 +39577,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -38031,10 +39603,5824 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "0f12db5f5124f213ec4f70e7ecaab4f9bce3fa56d65e7d7ca9bcd54bbf5509b1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* 
mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* 
mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "7c019518e8c168d95cb18d80c319a0cb5ae1076540aedd2c872b69899a7a9669", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ 
gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(1) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* 
mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "2d114c8d12de713beed820d99891198ce613f28dcbf6946977aef1bb5ed95063", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* 
mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} 
+, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "9e1e9f025d9ec5dacc0358fbda868961479f711e607784c01865316ba61fac04", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* 
mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* 
mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "7dad4a9f4e7b8ad97da964f0798c886f97e64fb5b396046da921b3825e853ded", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ 
trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* 
mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "66f38539fa36e77f4372630157bb395b8d09b714d4dcb01b80213d1ba1f93e9c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* 
mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* 
mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(1) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 183168, 
"bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "9c5295e215792ef5927d373c64b43b067f2d50b925f975ddbe3fd759202c8bf6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 
9 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, 
+{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "d74ea21f2571bc36bf9cfc9bf5ee604df5f12a2cbf3e92bb7f5435796635839d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, 
/* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* 
mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "ab8234cc482753d5705fca47a1bf31aba9947ae5efdc8b132b06abb717af677d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, 
/* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* 
mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len, 162208, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f", 512, "580bf0d33f76d04487f683f860e7a8e0f512796e0ff255a8de21b4822bfb1d23", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* 
mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* 
mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 1 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 2 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 0 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "54ca604ebdadaf7d0da1400c7b78db06b78bc893aea8bb8bc4f9d9de43c21858", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* 
mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* 
mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(1) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "118e3a9ff7ec8648c5d491a8f3906fee2d76e47e581c53ae5441ecfccf50a1df", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, 
/* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, 
/* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "2dae6169beb9d4752f108ad026467317c26208df6881b7fef62af51cbbee0f6d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* 
mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* 
mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 202480, 
"bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "9d0af31104ad6748cfc8584e5fc0e0d6bb67ee642ea3ef0be5d4c93c2c33f32e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 
5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, 
+{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "8740c8dc6c19dc8f3d3e16bc04564f313063dc296174a85f079cb9cb06dc03f5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, 
/* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(1) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 
0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "1f8812933f6a4fcc3d9e4b6fab0f5fc3744855141640ef747ba0e74038db5ab3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits 
*/ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* 
mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 512, "60e42c5cb21a203d1889812a96b00203630f8c0f29bb9686cdbf2f0080e6435f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* 
mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* 
mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 512, "95cefa483302f4ef06ede27fa897c5433a40594d62c1929d277440576dae5dd3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* 
mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* 
mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 209656, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "d4193df3d41d70deb72073e6b1e851caac9929116b3bf84d372769a5be32921e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ 
trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) 
+, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(1) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 209656, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "e8abf24c5dbb7e8df06849dd598e80d9b165708d1bd09a5316bbe77f386c70ad", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ 
gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ 
trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(2) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 209656, 
"bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "c13f80bbac12b80bf289ac79e6aca89dffed68cce4b5622192d0720621d932ce", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* 
mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(2) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, 
+{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 213752, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "c1a788a6007c2ce21109b1c59efda2754d4643e7025feec33786f53f14331ca3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 3 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1536 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ 
trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 3 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1536 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(1) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* 
mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213752, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "0026b6867d4d2fa0602edb807ec7c239f1417a26db2b251cf5affd1726d5e23c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 3 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* 
mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1536 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 3 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1536 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(2) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* 
mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213752, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "8310df3140a4481987cf9ae24ba172a8e8f5a1e020f789d649062847b330fddb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 3 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* 
mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1536 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 3 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* 
mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1536 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(2) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 217848, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "949db9eb4615d2fd91a419a78546bfc39136e00b2c011345a2ae27bbc693a98f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 4 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA 
*/ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 4 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 
+, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 2048 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(1) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 217848, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "0f15f12b03cefeb3072a9cfad400604bb179ddcb5f98ed2a164e99ad7714765f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* 
mClusterDimY */ 1 +, /* mClusterDimZ */ 4 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 4 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 
+, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 2048 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(2) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 217848, 
"bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "867b5c25e10baf579b42512d1f91744e7ef74bfa083765c7c8e0584a04f7d46d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 4 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 4 +, /* mNumSlicesForSliceK */ 1 +, /* 
mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 2048 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(2) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, 
+{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len, 162208, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f", 512, "44abbc4793547b61c27e12bb9d85eb2866e1c8169dc373af4cf0b2b6d455fe9c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* 
mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 1 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 2 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 0 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* 
mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "b7c4ecf7051f8011a028935254fb1868cf99b6763500ee83ba4d7be9d2b70bfc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK 
*/ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(1) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* 
mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "33ac935bcb9c19b9ed821c9537971cb20a69a2aef485629a3d44d6f88675906a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ 
*/ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ 
gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "2d1140fef9c613e127e8fce5af20e397e42b425a9e415fe9bdc9cc48fbfde980", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* 
mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 
0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "41216d23c56263d53e83233aa8fcab11d01015c60df98304910ee43bf0190633", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ 
trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 
+, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "e475168e5526fd76f1db1f9964dcc8e03464b904b7a10786e5964e7694f23142", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ 
gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ 
trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(1) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 202240, 
"bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "c0d6b926ca83414946b4216cfeb133a764396e2fc2a9198077ec6288d0dbb8b7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 
+, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, 
+{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 512, "65a12a1988ae76d24f8a45f314874b195d41924ed459be59b338f7cdc5ff874b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN 
*/ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* 
mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 512, "cdfae91fe388d3cfcb860271dcac66399c7ce433daa0e6541c41aa00f3fe6938", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* 
mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* 
mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 190592, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "2c0a49a404a9d6f84a819c3a16aa676b7c7a3e0c1e060ff11f3ebdcca0d7a9a7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 128 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ 
*/ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 128 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 152 +, /* mNumRegsPerThreadNonEpilogueWarp */ 80 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 4 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 128 +, /* mWorldSize */ 1 +, /* mActType */ 
gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 190304, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "c2f4eb86c167dc614e1def2299a2a4b694c1f71e78730290921866bc5896c5ed", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 
0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 128 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 128 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 152 +, /* mNumRegsPerThreadNonEpilogueWarp */ 80 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* 
mUseShuffledMatrix */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 128 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 211480, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 512, "5197372f88e928edcd53bdd42d38655b95a99e3a4cd154db14bb5d607d649191", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 2 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ 
trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 128 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 256 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 8 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ 
gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 128 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 211480, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "399ac758991dcd2c288d0e9097adff7a8235b4e3a579b43fc8ff51d29e0cd557", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* 
mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 2 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 128 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 256 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 8 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* 
mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 128 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 211480, 
"bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "2e84a6f316a6c50694c9d9cad8f6789e6e99edac1d53e98b0dd4a4759f1c8a9c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 2 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 128 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 256 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 8 
+, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 128 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, 
+{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 190592, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "6fb07486e24dc10d2b1b5c3659b9c1f6b6692e09e6e3a6e7337322e492bfdeba", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 128 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN 
*/ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 152 +, /* mNumRegsPerThreadNonEpilogueWarp */ 80 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 4 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* 
mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 190304, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "e9dfb21e23f3b8d5df2a65822a00b305f23e69a236e94290138db3b4c2baba04", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 128 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* 
mHoistMmaTaskTryWaits */ 1 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 152 +, /* mNumRegsPerThreadNonEpilogueWarp */ 80 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* 
mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 211480, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 512, "6e1acd66eb3603f9c0d3f4fb22c0ab39632ae96dad577a552b834732b5558d2b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 2 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* 
mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 256 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 8 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* 
mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 211480, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "3707d93bb2924745892cfea4d8a32a9eab9c7e6a2f75afbc5305e75ba0b7d885", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 2 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEnablesEarlyExit */ 
1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 256 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 8 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* 
mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 211480, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "9efb2838900f1924849a121e766fc36a2e67eb6d44ccd774d18cef139bc4ccd7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 2 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* 
mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 256 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 8 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ 
gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 119216, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "7184ceb12c566648b528e959852c6107f70a31c8f8200dade0a401b5a2b4e0e9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ 
-1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 128 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 4 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ 
trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 128 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 118928, 
"bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "58efd05b2b1aeddc2bd275161f726adbd1441f5060a283d455d5019b42096fcc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 128 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* 
mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 128 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, 
+{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 119216, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "c00a391f99417557458703ce2d583e09067349a78ecc0e4dc2a59026e6afa43f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 16 +, 
/* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 4 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* 
mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 118928, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "41694ebb37e65a93906a1bae0a04b86e723df2ec2f0b339995f82b85df3f7c73", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 
+, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* 
mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 227792, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "c802575473c80b5e47982228ff9b4c55968574e61d9ea7f0b0985f0021a5bb11", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* 
mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* 
mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 190592, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "10bea78630c8739b73ab0d765e7573899e908eba9177204c507fcef4ffec4545", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 227792, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "b5595895875f5b9caab4670fec464d635781ecc644966b7b3783ae734c1d867e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ 
gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -38048,14 +45434,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 128 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -38063,29 +45452,29 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 128 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 128 +, /* mMmaM */ 128 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 152 -, /* mNumRegsPerThreadNonEpilogueWarp */ 80 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 
1 , /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -38100,25 +45489,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 128 +, /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -38137,14 +45527,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 190304, 
"bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "34b81cc5b1ac2e1cacaeeffc8b32090a1c297e545a4a8cf3b4feef86ef9afcd5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 227792, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "53ad7ca2b6c622b641ae2d3a6890d8ac37fecc0ef6b2b39f1b19d401e73b9c29", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -38158,14 +45548,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 128 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -38173,30 +45566,30 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit 
*/ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 128 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 128 +, /* mMmaM */ 128 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 152 -, /* mNumRegsPerThreadNonEpilogueWarp */ 80 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -38210,25 +45603,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , 
/* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -38247,18 +45641,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 211480, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 512, "1c8a14377cb53045a9917fece018815c92c95f14b43ee477e826adbc362486ac", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "01738e4ee57b20ea965a25ee6b8eb15b252a2916bc828e6db05bb5516d1d0241", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* 
mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -38275,7 +45669,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -38284,29 +45681,29 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 128 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 256 -, /* mMmaN */ 128 +, /* mMmaM */ 128 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 128 -, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -38320,13 +45717,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 16 +, /* mTileScheduler */ 
gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -38338,7 +45736,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -38357,18 +45755,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 211480, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "1a978d243f4254610a238c87af9139b245e66858136ece17b36ede2d00aa3d7f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 227552, 
"bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "5dc7506bb064db53dafa33c77d446fcf25416de72ea499acce0a92b90447e18d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -38385,7 +45783,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -38394,29 +45795,29 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 128 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 256 -, /* mMmaN */ 128 +, /* mMmaM */ 128 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 128 -, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* 
mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -38430,13 +45831,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -38448,7 +45850,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -38467,14 +45869,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 190592, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, 
"adab4ddcb23d3f925e3bff9d29acf1e535e70682f15b6ebfc3b6cbc3fcde881e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "db2b888a564c59975e92f4dfb7273a0e288820e30de55da7b5f658bde520c068", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -38488,14 +45890,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 128 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -38503,7 +45908,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) @@ -38511,22 +45916,22 @@ 
static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 128 +, /* mMmaM */ 128 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 152 -, /* mNumRegsPerThreadNonEpilogueWarp */ 80 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -38540,22 +45945,23 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 256 @@ -38577,14 +45983,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] 
= { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 190304, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "cfd70975aab9bf6bf6cf74f0fa8f9b951a46ca0dcc4dae74bbecfeb5ee7b8cc8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 227792, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "7eb947a1deb70da086ec6198145c44d6ac96949cfdd0354e79714252beea86f7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -38604,8 +46010,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 128 +, /* mEpilogueTileM */ 128 +, /* 
mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -38613,30 +46022,30 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 256 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 128 +, /* mMmaM */ 128 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 152 -, /* mNumRegsPerThreadNonEpilogueWarp */ 80 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -38650,25 +46059,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* 
mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -38676,7 +46086,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -38687,18 +46097,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 211480, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 512, "bbcd503930c9ef8efe7841fc8ca42af5a06ddc8639606176157c36acee81f677", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 227792, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "b9ffd797f2cf1d5d3e0e12cda3c25352b983a285336823550f5af5e45fea35e6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -38708,14 +46118,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -38724,26 +46137,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind 
*/ trtllm::gen::MmaKind(2) -, /* mMmaM */ 256 -, /* mMmaN */ 128 +, /* mMmaM */ 128 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 128 -, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -38760,13 +46173,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 128 +, /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -38778,7 +46192,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -38786,7 +46200,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -38797,18 +46211,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , 
/* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 211480, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "9ee73850653d0f04787afa9cb5ed26e2f83c65b6839471294774fcf6fa885fe7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 227792, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "919feb308d711a56b7dd48fe5fada2891b25c6abfd4e3147e91715cfae7694f6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -38818,14 +46232,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ 
trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -38834,26 +46251,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 256 -, /* mMmaN */ 128 +, /* mMmaM */ 128 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 128 -, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -38870,13 +46287,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 128 +, /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* 
mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -38888,7 +46306,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -38907,14 +46325,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 119216, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "de5234b4efc3e2be2ba7897c3d458c9d8e995a69a7695abf20daad89a5644f20", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 227552, 
"bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "692a853eb3a8a63d77393196fd081b342696009d4dec8c9559197e84d263a56d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -38934,8 +46352,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -38943,15 +46364,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 128 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -38959,14 +46380,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMma */ 1 +, /* 
mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -38980,25 +46401,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -39006,7 +46428,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -39017,14 +46439,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 118928, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "4deabc5dd45af21dd1e618bd9be6edea7b3fd658d3758b3e7871c1a7a3347fdd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "996263183ed779e9843bc72e90474db2e70ead2bf6f8b84a3c2f1ef34e42e185", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -39038,14 +46460,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* 
mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -39053,15 +46478,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 128 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -39069,13 +46494,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -39090,25 +46515,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* 
mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -39127,14 +46553,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 119216, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "9be3d8c1dc22c93e4a373d7ab12c7cfc5b8c66c599de1a24efa46b1d736629e0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, 
"d083f530ef8a430031beadcd74c696ffeab9fd32c7ae4f3b725d87b3dc478531", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -39148,14 +46574,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -39163,15 +46592,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 256 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -39179,14 +46608,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 6 -, 
/* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -39200,25 +46629,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -39237,18 +46667,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, 
Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 118928, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "e76d65e80d21afb207a8226ed64113ea644f016c512aa78858ea3442e1df6839", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 212472, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 512, "c3a79514c7d3016530f8668761962bf7c289f787b344b4cf8ad0671bd4a69911", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -39264,8 +46694,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -39273,30 +46706,30 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* 
mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 256 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 128 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 +, /* mMmaM */ 256 +, /* mMmaN */ 192 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 +, /* mNumStages */ 7 , /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -39312,23 +46745,24 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 192 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 
128 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -39336,7 +46770,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -39347,18 +46781,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 227792, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "76616e3ed4db323c68047f2841bcb2dbc6723f7f53317bb910ca4039823266f8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 212472, 
"bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "c22befb7beec2d63e1d12ce3c41d0167f7a38eda9bc66d2cdff34a30a4a32773", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -39368,14 +46802,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -39384,26 +46821,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 128 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 +, /* mMmaM */ 256 +, /* mMmaN */ 192 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* 
mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 +, /* mNumStages */ 7 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -39420,13 +46857,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 16 +, /* mTileN */ 192 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -39438,7 +46876,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 128 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -39446,7 +46884,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -39457,18 +46895,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 227792, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "c8c927b723eb7272ee8fde04cc050e89b9cec61716d98f0497a33214b9c42af9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 212472, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "08a39f242c76471499958aa5480bbf91821f9900f0234e89576d571df77055ee", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -39478,14 +46916,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* 
mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -39494,26 +46935,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 128 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 +, /* mMmaM */ 256 +, /* mMmaN */ 192 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 +, /* mNumStages */ 7 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -39530,13 +46971,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 16 +, /* mTileN */ 192 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -39548,7 +46990,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* 
mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 128 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -39567,18 +47009,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "0b6d17625ad09b10e807cd697c332691c32cd8b0c78609b80799a9f25a238a3e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 212472, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 512, "e0eccdcea0d80b003331c214a5ae9755acc3adfd4b53cf94ec2251daf2f59e9b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* 
mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -39595,7 +47037,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -39611,22 +47056,22 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 +, /* mMmaM */ 256 +, /* mMmaN */ 192 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 7 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -39640,13 +47085,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 192 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, 
/* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -39655,7 +47101,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 256 @@ -39677,18 +47123,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "9bb137cf01f5a31aa59466be5ae2f7a8655196e4576ec42fe5cc67bae62bc854", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 212472, 
"bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "32e2911a5a4026509518fad5fc15513582c54fe369d88d3a9145cb8bc0dae3d9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -39705,7 +47151,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -39721,22 +47170,22 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 +, /* mMmaM */ 256 +, /* mMmaN */ 192 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 7 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -39750,13 +47199,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , 
/* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 192 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -39765,7 +47215,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 256 @@ -39787,18 +47237,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 227792, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "77ee796c0972ff675edef13b3c1246f046b49c17a80d5acfa1158c2b5357407a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 212472, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "88b8a23cbe3f90e12190f600a2d4aa115a7a8438e4f9747f256d79baef9345ed", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -39808,14 +47258,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -39824,26 +47277,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 
32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 +, /* mMmaM */ 256 +, /* mMmaN */ 192 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 +, /* mNumStages */ 7 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -39860,13 +47313,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 16 +, /* mTileN */ 192 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -39878,7 +47332,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -39886,7 +47340,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -39897,18 +47351,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* 
mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 227792, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "fc757d774dfc01f8ee56f0fac9aac14812177063b3a34043d15eea9388665702", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 221656, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 640, "67613c8eb398d7053092b70585abf7e7165bff4a480327df40fd1945e7e14e13", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -39918,14 +47372,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ 
trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -39934,23 +47391,23 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 128 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 +, /* mMmaM */ 256 +, /* mMmaN */ 256 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 4 +, /* mNumEpilogueWarps */ 8 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 6 @@ -39970,13 +47427,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 16 +, /* mTileN */ 256 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, 
/* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -39985,10 +47443,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 128 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -39996,7 +47454,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -40007,18 +47465,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "896d43bc99e892544d14e6e1c40fa225495ebe7510f24b97263de3beca8846d8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 221656, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 640, "07776452cf965f64bbe524602fc6d36ecfa69e5884599ea532a9910126b493e3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -40028,14 +47486,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -40044,29 +47505,29 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 128 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* 
mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 +, /* mMmaM */ 256 +, /* mMmaN */ 256 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 4 +, /* mNumEpilogueWarps */ 8 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 6 -, /* mNumStagesMma */ 1 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -40080,13 +47541,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 256 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -40095,10 +47557,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 128 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -40106,7 +47568,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ 
batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -40117,18 +47579,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "5565c5fd80c185d09bde0e197f7426cf75f15178dfe68ae9beb58b3662a9297e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 221656, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 640, "a4eca2369f72e396fe2b2ba093f8c2055cc146ae99cd407cdcc11f493c7a4105", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* 
mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -40138,14 +47600,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -40154,29 +47619,29 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 128 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 +, /* mMmaM */ 256 +, /* mMmaN */ 256 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 4 +, /* mNumEpilogueWarps */ 8 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 6 -, /* mNumStagesMma */ 1 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* 
mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -40190,13 +47655,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 256 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -40205,10 +47671,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 128 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -40227,14 +47693,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 212472, 
"bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 512, "c230ec4a113e6e2d5e21ec5ed351c1cb172b845158dfef8444cde4ee131a8b7c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 221656, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 640, "86583677036311815e42574f9dce3319a9505bbde64b0e74666027bef64d4348", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -40255,7 +47721,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -40264,7 +47733,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 128 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -40272,10 +47741,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 
256 -, /* mMmaN */ 192 +, /* mMmaN */ 256 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 4 +, /* mNumEpilogueWarps */ 8 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 @@ -40283,7 +47752,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 7 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -40302,11 +47771,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 192 +, /* mTileN */ 256 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -40315,10 +47785,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -40344,7 +47814,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, 
Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 212472, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "8aea9748c1819a60940dfa078d3e75769b7b939426736f48c8d1363b56436e6e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 221656, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 640, "1ec8d45fde137b3021001caf86a49cbdfd3a2b693f0512b781fcb5d1b9551fd8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -40365,7 +47835,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -40374,7 +47847,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 128 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ 
gemm::MatrixLayout(0) @@ -40382,10 +47855,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 256 -, /* mMmaN */ 192 +, /* mMmaN */ 256 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 4 +, /* mNumEpilogueWarps */ 8 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 @@ -40393,7 +47866,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 7 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -40412,11 +47885,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 192 +, /* mTileN */ 256 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -40425,10 +47899,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -40454,7 +47928,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 212472, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 512, "1ae27eca9a3fa771f1b46f6f8cdc29d06af50750c786c1b2b40c38bd6782f06a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 221656, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 640, "e2783079f767599363804f5f8f1380f3922743479d55b61a04b8cabcab734356", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -40468,14 +47942,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 64 +, 
/* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -40492,10 +47969,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 256 -, /* mMmaN */ 192 +, /* mMmaN */ 256 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 4 +, /* mNumEpilogueWarps */ 8 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 @@ -40503,7 +47980,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 7 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -40522,11 +47999,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 192 +, /* mTileN */ 256 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -40546,7 +48024,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -40564,11 +48042,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 212472, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "53da81b182577367107d4ce72907cffc32afd1cd30d20e622a5e9be71d3b13ad", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 133552, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "b2f063134d9204f08c963318288aff012908bde2b19f828d9f7f44da5e556671", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -40578,14 +48056,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 
, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 +, /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -40593,29 +48074,29 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 128 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 256 -, /* mMmaN */ 192 +, /* mMmaM */ 64 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 128 -, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 7 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 4 +, /* mNumStagesMmaWithinWorkTile */ 2 , /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -40632,23 +48113,24 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 192 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* 
mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 1 -, /* mUseShuffledMatrix */ 1 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 128 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -40667,18 +48149,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 2 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 221656, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 640, "21aaf76c12b3fc47df74bbd86c0384deea1e8ce9dae233b482ca12c075e1db74", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 133264, 
"bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "411b5a1fc76ffa400f339d80f019fd0cc8f567505d4f2a4945389d35f920bd98", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -40694,8 +48176,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -40703,7 +48188,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 +, /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 128 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) @@ -40711,22 +48196,22 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 256 -, /* mMmaN */ 256 +, /* mMmaM */ 64 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 8 +, /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 128 -, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , 
/* mNumSlicesForSliceK */ 1 , /* mNumStages */ 6 , /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -40742,16 +48227,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 256 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 1 -, /* mUseShuffledMatrix */ 1 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 @@ -40766,7 +48252,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -40777,18 +48263,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 2 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 221656, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 640, "93a9379ad1f72d9a3603275c88618b1a9fc95e96afbf429d5f45a2d508475cb7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 133552, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "f496b3e4db0810bca13a238b951f4d388989c1c4c6ecb32d038c11cf1a59c8f9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -40798,14 +48284,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* 
mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -40813,29 +48302,29 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 128 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 256 -, /* mMmaN */ 256 +, /* mMmaM */ 64 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 8 +, /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 128 -, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMma */ 4 +, /* mNumStagesMmaWithinWorkTile */ 2 , /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -40852,23 +48341,24 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 256 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* 
mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 1 -, /* mUseShuffledMatrix */ 1 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -40887,18 +48377,132 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 2 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 221656, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 640, "47fe3d4cb8dcb6e2e012ac08c19bc445ef0e55a37f9f7a523a9cba4d7d050ea7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 133264, 
"bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "ad2c4d86e29eb5509a1c2ded57801a3016941c5ae27251ff763ad939a04fce95", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* 
mNumStages */ 6 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, 
+{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 213424, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "779847477fd4bb4da5c0bd609f694e36a161e34f113b83e7c23db34fed03a69e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -40915,7 +48519,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -40931,19 +48538,19 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 256 -, /* mMmaN */ 256 +, /* mMmaM */ 128 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 8 +, /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 128 -, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 +, /* mNumStages */ 5 , /* 
mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -40960,13 +48567,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 256 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -40975,7 +48583,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 256 @@ -40997,18 +48605,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 221656, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 640, "9819b6c04ea4707483bf7fee5e05d1d35f4c1e7a834b501864e606439c257617", 
"", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213424, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "e3312bef21c92eeb469a4d59a69a0e4390cac5caf53441743242ee76062a2d7a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -41025,7 +48633,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -41041,19 +48652,19 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 256 -, /* mMmaN */ 256 +, /* mMmaM */ 128 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 8 +, /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 128 -, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* 
mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 +, /* mNumStages */ 5 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -41070,13 +48681,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 256 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -41085,7 +48697,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 256 @@ -41107,14 +48719,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 133552, 
"bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "e239d53043619321ba67def1c1e568db27762c086b5403b5b3144fb34d00bcd4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213424, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "0bd42b53bc39c5ef383c16a53e26dae8d6d2dcbabd57dec3d98dc0ea6a7ee2d9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -41128,14 +48740,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -41143,15 +48758,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* 
mHoistMmaTaskTryWaits */ 1 -, /* mK */ 128 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -41159,13 +48774,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -41180,25 +48795,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , 
/* mClampBeforeAct */ 1 @@ -41217,14 +48833,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 133264, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "afe96180a941e897e032bfa90b3f3be1f5e364c570bf4a6936120b88190b5df9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "877eca3c7b7db387e8d0833a0cad9cd24382d247eb74f1a989e46d58a451a8b4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -41244,8 +48860,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* 
mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -41253,15 +48872,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 128 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -41269,13 +48888,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -41290,25 +48909,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* 
mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -41316,7 +48936,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -41327,14 +48947,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 133552, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "0ee9c206ebc62b3adcc210817908f2970177d94dac95a3ead6e8f22b6c957a6e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "b558ba1579ef08f2ed2b3a1b170785e9748e58d3559a51e32b5b7cbb6d3cf54d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -41348,14 +48968,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -41363,7 +48986,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) @@ -41371,7 +48994,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 
128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -41379,14 +49002,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -41400,22 +49023,23 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 256 @@ -41437,14 +49061,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadB */ 0 , /* 
mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 133264, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "c486f36d1360bb4b6b62cb72961abbc1ca51aa5b4f87577c3dcbd06e3c87dfec", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "befcf2a22cc3cf63262518d0b0a41d4b4419544d5e276727f563704d6db12a2d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -41458,14 +49082,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* 
mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -41473,7 +49100,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) @@ -41481,7 +49108,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -41489,13 +49116,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -41510,22 +49137,23 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, 
/* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 256 @@ -41547,14 +49175,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 213424, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "780d767449edfa48e39cc780f4e682e0d645895ad0dc00c4785f8b602d4a419f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 213424, 
"bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "62ff3b8c90693617df3001d326aee7a6930acc7a63ca2f6e12e2393c46206685", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -41576,6 +49204,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -41584,7 +49215,121 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* 
mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213424, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, 
"7715f47c5135b4bb261721e71f95e5504fac6ee8b53d822b7382ba6df0a19e83", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -41627,6 +49372,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -41635,10 +49381,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 
256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -41646,7 +49392,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -41664,7 +49410,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213424, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "8bc7a9eb910b1759647cc869c7d76302135de4e666655db998ba870b5b2519be", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213424, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "25d50e75c7051a0124c73b74ef05f4ef125b7a74947756fa32dd1993e88ec287", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* 
mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -41678,7 +49424,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -41686,6 +49432,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -41694,7 +49443,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -41737,6 +49486,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -41745,10 +49495,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -41774,7 +49524,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { 
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "96faae3fd1d5d672746ceee471ea645ae791a2cb7bc95b363215b3acd402aa92", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "639f2a6cbdd30ded2f20c94e1dec2c51c4186be449b6cffe341f62a6506f20fd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -41796,6 +49546,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -41804,7 +49557,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 
-, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -41847,6 +49600,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -41855,10 +49609,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -41884,7 +49638,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "45ad4c2603016f88b5c3f9a43bf744f77854f97c09ff3950dbab7277a88a00f7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "e92c75c751de4e213a73cd3cb24a0f1cff45a669b2a51c08b45b2557d643b3f9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -41906,6 +49660,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -41914,7 +49671,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -41957,6 +49714,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -41965,10 +49723,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -41994,7 
+49752,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 213424, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "aff4d9bacf5e87172a64929b6220e5a4749e9d60ac3d838075c5721549e277be", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "dc2c960237809caf9b53824f03f6c3dbede7487341cf68ed87f7ee97987d6f43", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -42008,7 +49766,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -42016,6 +49774,9 @@ 
static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -42044,9 +49805,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 5 -, /* mNumStagesMma */ 2 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -42063,10 +49824,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -42086,7 +49848,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -42104,7 +49866,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, 
Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213424, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "2121822aa18c44e48206609dd993983225e3b008694c0f9ee886f654cf90776e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 164016, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "68f1fed01df47d98b7e50e165405cf017ccc193df3f1dd6908668c0b4ec26f23", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -42118,14 +49880,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB 
*/ 1 @@ -42133,29 +49898,257 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 128 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 64 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 4 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* 
mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 128 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 163728, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "86f01acb60e8bde158dd8646193473fa183240b3a3bedd403da2e0130fe3f026", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ 
trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 128 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* 
mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 128 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 164016, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "fc58e912146d1f7b93f8492ec7f76343befb242d15a95396173d8b3fa01d3e74", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* 
mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 4 +, /* mNumStagesMmaWithinWorkTile */ 2 , /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -42170,25 +50163,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 32 +, /* mTileN */ 64 , /* mTileScheduler */ 
gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 1 -, /* mUseShuffledMatrix */ 1 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -42207,14 +50201,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 4 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "0605a1e713d94d18eebf51f2c4cd110fb426cd13eb25377abdaacf50b459a746", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, 
Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 163728, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "63449a3fc3950e963844eb49a4e617b2886dee8cc5da2deb88ba96425f9fbdbb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -42234,8 +50228,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -42243,29 +50240,29 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 64 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages 
*/ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -42280,25 +50277,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 32 +, /* mTileN */ 64 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 1 -, /* mUseShuffledMatrix */ 1 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -42306,7 +50304,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -42317,18 +50315,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 4 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 
}, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "fad72e41f566cf086d8f9f9a2bc3a569f882b780ccad4819154e8a2886725bde", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 218552, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 512, "e5d30f53a37b9d7d546117c347f1f3a692996d7e18a79130621149742615f401", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -42338,14 +50336,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* 
mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -42354,29 +50355,29 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -42392,11 +50393,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -42405,10 +50407,10 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -42416,7 +50418,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -42427,18 +50429,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 164016, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "14b4121f74c5325fdaa9eb893f28ffff7fdc6a4c6b4bbab0352ef86060d7c229", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, 
Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 218552, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "848568de867415876ecc2324f27f6aafbe15a8ec375a52d9a76135a8553da762", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -42448,14 +50450,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -42463,15 +50468,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 128 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 256 , /* mMmaN */ 64 , /* 
mMockAllReduce */ 0 , /* mN */ 256 @@ -42479,13 +50484,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -42500,25 +50505,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 64 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -42537,18 +50543,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* 
mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 163728, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "22d2c93c7ce2d6b262259d4ced9585b578adce682b37bbbe2445e505b299f321", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 218552, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "c59966028657efd1eb71cc5441718bd3b749f076fd10fa7c1637cee0db2181b2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -42558,14 +50564,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* 
mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -42573,15 +50582,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 128 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 256 , /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -42589,14 +50598,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 +, /* mNumStages */ 5 , /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -42610,25 +50619,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 64 -, /* 
mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -42647,18 +50657,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 164016, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "dedf038750ba4522b0cc81e393705d652b65089173e45522aa9898a7968778e2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 218552, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 512, "3da0b443881c71ca893f51a171877436394a32bd828f0e84d9861e7318d17b4d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -42674,8 +50684,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -42683,15 +50696,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 256 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 256 , /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -42699,13 +50712,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = 
{ , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -42720,25 +50733,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 64 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -42746,7 +50760,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -42757,18 +50771,18 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 163728, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "c92f86f3267cd9cf3fa61358213156b1771f405265b50118cfea4b24abb72e57", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 218552, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "3d17cd4accd5e945ec1eb6c119ea446a3642db2772db232ee531f3c5615786ef", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -42778,14 +50792,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC 
*/ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -42793,15 +50810,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 256 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 256 , /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -42809,14 +50826,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 +, /* mNumStages */ 5 , /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -42830,25 +50847,26 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -42867,14 +50885,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 218552, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 512, "97c17ccb77239cc24f67749ea08bbd6b9ba9505742a5df82616bc75e10d926b5", "", 
nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 218552, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "b171248735734be7863fbdd224ba8d3eab0d3340afb46941125aadc42ff53d39", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -42888,7 +50906,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -42896,6 +50914,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -42904,7 +50925,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -42947,6 +50968,7 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -42955,10 +50977,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -42966,7 +50988,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -42984,11 +51006,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 218552, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "d707910d4b43668d45a660c10853105334ce986ad5d7a8fd564fdb20ddc4ac75", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len, 77600, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f", 384, "caebbaf5771493937cde67c16b6677a764139c37b845d26c2fc3318045fa7a5c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -42998,14 +51020,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) -, /* mEnablesEarlyExit */ 1 +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -43013,30 +51038,30 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 128 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* 
mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaM */ 64 +, /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 128 -, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 4 , /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -43050,25 +51075,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 1 -, /* mUseShuffledMatrix */ 1 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 128 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -43078,27 +51104,27 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchStrideInTokens */ 
-1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 +, /* mIsStaticBatch */ 1 , /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 +, /* mNumBatches */ 2 , /* mNumRegsPerThreadLoadA */ 0 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfA */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 +, /* mNumTokens */ 0 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 218552, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 512, "fdc7f5c9ef8f981f6e7cfde61ff766ea5e9ede83ef7314155ccf95d19c4c829e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 148240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "846b532c796dc0e127798665c6524e7d019d85fc681eec1bbdadfc48c7d8d566", "", nullptr, 
nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -43114,8 +51140,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -43123,29 +51152,29 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 128 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaM */ 64 +, /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 128 -, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStages */ 8 +, /* mNumStagesMma */ 4 +, /* mNumStagesMmaWithinWorkTile */ 2 , /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -43160,25 
+51189,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 64 +, /* mTileN */ 8 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 1 -, /* mUseShuffledMatrix */ 1 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 128 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -43186,7 +51216,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -43197,18 +51227,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 2 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, 
Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 218552, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "afbbfe45241b22715d97ca37d335ccc77369f7a41ea68688d4b76febaf8e3d60", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 147952, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "e12aec7b0ed57c48c93b429221281667d69882bc77a876f933acef919cbd5f5b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -43218,14 +51248,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 
+, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -43233,30 +51266,30 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 128 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaM */ 64 +, /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 128 -, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 8 , /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -43270,25 +51303,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA 
*/ 0 -, /* mUsePerTokenSfB */ 1 -, /* mUseShuffledMatrix */ 1 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 128 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -43307,14 +51341,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 2 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len, 77600, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f", 384, "5e2e258dca62724ab45922f00bc5652a05bad01f9823a6249809ab09e15be569", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len, 77600, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f", 384, "efc37cf74d1a51fe2bb804060927f47810f9ed8f9640debd4f5f25c75533769c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ 
gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -43336,6 +51370,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -43344,7 +51381,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 128 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -43387,6 +51424,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -43395,10 +51433,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -43424,7 +51462,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, 
Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 148240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "36773cfa9b99b0ff4d25c5b2a552bde2d0edd1e2bb66c2e1a31a098a5b9519be", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 148240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "08888a00664ed94f652eccbf1e643e4fd127fa3affe599709e8937e7b0cef514", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -43446,6 +51484,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -43454,7 +51495,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 128 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -43497,6 +51538,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { 
, /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -43505,10 +51547,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -43534,7 +51576,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 147952, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "b708c6e1a57de0692a219e8ba73663d6b8fcca40e6ec65c2cadc60693819eef2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 147952, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, 
"aa028807fd5a6a7e6d74d8ba1dbca40a88a852c3ab39baa7c810ce97e7b216b4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -43556,6 +51598,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -43564,7 +51609,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 128 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -43607,6 +51652,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -43615,10 +51661,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -43644,7 +51690,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin, 
Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len, 77600, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f", 384, "11328576ef18d587a24abf0e95dc6459531509088a8a4a814658941d9363d4ab", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214480, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "e47ef1efb0abc48b8e8527f980306b095ed7b4a7c855abe1ab847538a9ab104a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -43659,13 +51705,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 0 +, /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -43673,7 +51722,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 
1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) @@ -43681,7 +51730,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -43693,10 +51742,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -43710,22 +51759,23 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 256 @@ -43736,25 +51786,25 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} 
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 1 +, /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 2 +, /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadA */ 0 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfA */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 0 +, /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 , /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 148240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "e7c95d138fb8526a68525f61fcabfa378b6ff0e767a5dfad18c4b1ed27c36251", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 214480, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, 
"c91514ec1d85c0c728a2f65df78a89df337d27c54bb2ab19aa4f25a5dc12655a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -43768,14 +51818,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -43783,7 +51836,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) @@ -43791,7 +51844,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -43799,13 +51852,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* 
mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -43820,22 +51873,23 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 256 @@ -43857,14 +51911,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 147952, 
"bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "a04193becbe4b2951af6be63662d570c3dba78593fb3c78b1619446c044218c2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 214480, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "3c57f969caeeadc8b8a844f011107d7e8e15c8805949459589a114f8172bb87e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -43878,14 +51932,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -43893,7 +51950,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* 
mHoistMmaTaskTryWaits */ 0 , /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) @@ -43901,7 +51958,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -43909,14 +51966,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -43930,22 +51987,23 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , 
/* mValidN */ 256 , /* mValidK */ 256 @@ -43967,14 +52025,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214480, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "497b0f26b6690cda895ae72ebebbb9a2f304196d69c7db8911a018842fbc946f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 256, "9a5b799859e98ce8f34031d7644a9bace4cc972494bc298344dc984ca367e079", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -43996,6 +52054,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* 
mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -44024,9 +52085,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 6 -, /* mNumStagesMma */ 2 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -44043,10 +52104,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -44084,7 +52146,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 214480, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "4ffed82144fdf040ec77bfa44fcd6c07770a0b7a8f92e48ff72a40e84b116399", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 256, "9a4e01b26d82e92734dd9a9d01ede8079e8806ee0441db7f565347db4fc514f8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -44106,6 +52168,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -44134,9 +52199,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 6 -, /* mNumStagesMma */ 2 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -44153,10 +52218,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -44194,7 +52260,7 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 256, "f00cd8c1985b2766807644ad37e10034049d6cf64362eb85107fd46c35be2fbe", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 256, "627b2ac38c34b5ef656882a55caa1bc59da0ff9d68ecbfed7e6bf10790c6a503", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -44208,7 +52274,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -44216,6 +52282,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* 
mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -44267,6 +52336,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -44286,7 +52356,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -44304,7 +52374,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 256, "0fb8a1dfd2521f25fa1db0178b7d28ee2e8a1e53d233c4b68303e9d60dc86734", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214480, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "90d07e6ab949b4bf1757da549fc01208a5e3c7fa008a105c43cefa91fa247aae", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -44318,7 +52388,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -44326,6 +52396,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -44334,7 +52407,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -44354,9 +52427,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 6 -, /* mNumStagesMma */ 1 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* 
mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -44373,10 +52446,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -44385,10 +52459,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -44396,7 +52470,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -44414,7 +52488,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214480, 
"bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "60dba8c49d0f2f9a7aa945ca9ef52fc0433091712926387419b7223de55e47e0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 214480, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "3c48dfb93ea065bfb1fc3b5e0c4193caf3769b43889c39299b4b7c7bd7e7f2d0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -44428,7 +52502,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -44436,6 +52510,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -44487,6 +52564,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* 
mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -44506,7 +52584,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -44524,7 +52602,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 214480, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "756e8559479ba4d67e8d6d92e47a64e8774bbd43a9107e8b7150323db6ab5803", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 214480, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "8b1f34836e0d9459464a99402205a13cc3bf91d416eb5061b9b3e76f3e494961", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* 
mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -44538,7 +52616,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -44546,6 +52624,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -44597,6 +52678,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -44634,7 +52716,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 256, "05947b85dd17b929cd15da57fa3ad78b2c6531bd038de579cf2dd54fbe98ac04", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ 
gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 256, "5c1143b3be806923ad1f13870e678d73a4ac54b75c779d68c43f9429066b5c44", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -44656,6 +52738,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -44707,6 +52792,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -44744,7 +52830,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 214240, 
"bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 256, "711d19e0cc7889cb0b868d16e56e29e8c40d728703d8d272d39e3ced1cd4bc0f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 256, "be59f07d33ee73a7f28ec940cfc05ef783a89d9df9f6306f4cba7cd007de51d8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -44766,6 +52852,123 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* 
mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, 
gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 256, "36e0a578cf64ee49d253a78c357095026d1f5d804914aa246b233f4433702011", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -44817,6 +53020,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -44854,7 +53058,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* 
mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len, 214144, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f", 256, "7586849862f48c35dc51ef72de4e3614fddfe4d4e9dc81ac181615f9cbcd93c8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len, 214144, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f", 256, "c9b1d2561c959a94b0fdb87bf086963d1df9d6676a7cf1a0bce91b7f162f0b80", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -44876,6 +53080,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -44927,6 +53134,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -44964,7 +53172,7 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len, 214144, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f", 256, "b322893a56acf627a2c76ba065a3e006f16b4e7ba61ddfa566854cedbacfb330", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len, 214144, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f", 256, "1d6b606fc88ee6fc7fad44d838d6a6c5277cc965c815cf3a0cb6e28085e30300", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -44986,6 +53194,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -45037,6 +53248,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 
@@ -45074,7 +53286,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len, 214144, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f", 256, "9a42bf19e464c20f02c763dac3f06691dac7675b5fc384fe79b693322bb97b54", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len, 214144, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f", 256, "91850eca1bf5b2eaf94e3a46b0ab682e772c656f09be0898f7030a897faba4aa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -45096,6 +53308,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -45147,6 +53362,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* 
mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -45184,7 +53400,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len, 214144, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f", 256, "da07bbc5373000c6d5a453d46b0f4c9620fa4a88210390c049b0bc00ad0042fd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len, 214144, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f", 256, "8f9f5b9287f41103864e9eab684c1618078ad2da274f0cd3fc57797213a68092", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -45206,6 +53422,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -45257,6 +53476,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* 
mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -45294,7 +53514,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 197200, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "9932153ef5ad86fc4b7696506c9075145a00a492c2ace8697c262fcfc0eea43b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 197200, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "96988a185ae179937cd0075911076d651ff4341ab52b2e7906bc3fcabbc495e0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -45316,6 +53536,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* 
mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -45367,6 +53590,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -45404,7 +53628,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 196960, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "769cb2dfc0085f361090c8252efab80500b844aec3a2c7dbd9b0593caf974acb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 196960, 
"bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "3f91cc7e530610714d37d1f4f445240cb2b06738eb36027d69af9296904d6eaf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -45426,6 +53650,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -45477,6 +53704,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -45514,7 +53742,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 197200, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "d08d72b8f206822bf85906244e4f05effbb198d637c6303cb46d839cda6614ce", "", nullptr, nullptr, nullptr, 0, { /* 
mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 197200, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "4021172dea380144838afc9aa1724fd31b213e5f456b6f2957fd74d443107ba4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -45536,6 +53764,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -45587,6 +53818,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -45624,7 +53856,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 196960, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "7ba3ca52424fef94ee1169be10ea2ffe86c866aada27f167d1c7031a62372b42", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 196960, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "2195f0af0858d5263a3be244c298c045639a6d862869cc513e71e0f5261842b4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -45646,6 +53878,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -45697,6 +53932,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* 
mUsePerTokenSfA */ 0 @@ -45734,7 +53970,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 219728, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "e939b686444d876f82e0e8af796aaea14da9cbabfabb735ff19f99648461aacc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 219728, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "c9b760668328173094a04a2b02271a9676f0752420b5ac4f356497e3f27a5155", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -45756,6 +53992,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 
+, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -45807,6 +54046,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -45844,7 +54084,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 219488, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "f500b0672062287cf5385da6b8f4cb3b8dc93922faf0466b55820963bccf702d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 219488, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "b9cf360be3f0bc8949972a61f1b4f3f3345f184b2412a47ba57e63ecf8a59aa0", "", 
nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -45866,6 +54106,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -45917,6 +54160,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -45954,7 +54198,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 219728, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "f7fe273a644d76d3de6a6ae9e6105522fd73c2be3208e9aae648b462791c167d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 219728, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "1529b73b4514bc42bbc245590453f653d15d4945c318bdc55d53e2a5e7ef99fb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -45976,6 +54220,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -46027,6 +54274,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -46064,7 +54312,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 219488, 
"bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "fe8c4bf3f2c7514381f7d124eaa33c910f0af6a04bb07dd5ff826b9a2a8af703", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 219488, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "cd1b01ba95e1b642f4dfd44e6d944a6ffd710385e5534687b419b6a0e2ea9358", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -46086,6 +54334,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -46137,6 +54388,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -46174,7 +54426,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, 
gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214544, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "c3c0291634ae7e86f158af51dc8b383431dab655f91e95baf129e2737c51a640", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214544, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "8da9fd9e492c359a1b7c785e1f1d69f7fa438a96ff93e4a883a01de10c0acbd2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -46196,6 +54448,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -46247,6 +54502,7 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -46284,7 +54540,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214304, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "a72bc6a9c2270996a0624aa357cc81234ccd1f8aa236cbe8176936aafc77bcc2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214304, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "c3e56810270e2d35d3e72cdd4eccba186f559ea4a0a65d8dbc0bc2dfacd6df14", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -46306,6 +54562,9 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -46357,6 +54616,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -46394,7 +54654,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214544, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "6c7c13c61d622ce70ec512272aacc5711018f4293304139f395398c21e40f315", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214544, 
"bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "508b9d11109bd267e5039d7be3f3a2d80a64f24345d2a57ff06926ed73aa6aad", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -46416,6 +54676,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -46467,6 +54730,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -46504,7 +54768,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214304, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "66a517777b4a9912e343155ef8dfbf6d6f71efc209be382e1b0daafbffe4cee3", "", nullptr, nullptr, nullptr, 0, { /* 
mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214304, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "7160076f2cb8a882c489b5484986faa9e43f8f412e87ce4cb02cbc1b4acb34f3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -46526,6 +54790,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -46577,6 +54844,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -46614,7 +54882,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 185936, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "7a6531a59fe86b239952561e418fa58180e47a5562ea58bfb7d59d55cf928781", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 185936, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "e3c98d05937fe236de2ac659d3b93815d51a124bcd87620e073d156189a59d53", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -46636,6 +54904,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -46687,6 +54958,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* 
mUsePerTokenSfA */ 0 @@ -46724,7 +54996,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 185696, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "e03e22f1da49e9ed9d612496e91f4ecf3d624a6e4cdf1aac8bd15eb9f4f8f030", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 185696, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "b6e03f7051e85cd83905490dcf16b95fdd8d115b5b173b00bc92c3f1b125039e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -46746,6 +55018,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* 
mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -46797,6 +55072,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -46834,7 +55110,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 185936, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "fb10688b64b96277647f6dcaf802d5619fba6fccd3c4f806213e1717172c21cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 185936, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "3aac61a827af266fe0db5f1f105612f87405bccda994674ce50083874ab4b9b1", "", nullptr, 
nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -46856,6 +55132,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -46907,6 +55186,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -46944,7 +55224,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 185696, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "09f9eb32dc8060da21f95b8ddf523e04f7a66ee204137e29a755dde89a05cc22", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 185696, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "17422ea5910409c94098a70735101168f87558ce9a837cde1f38482391ad7d07", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -46966,6 +55246,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -47017,6 +55300,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -47054,7 +55338,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221648, 
"bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "6cd27a06ae4bb374c82acd80ab65999641f69e7cf264d0c881041ba8c7eb4279", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221648, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "6ccb4de6f5dbc961a9cdf66ce5f817baafef0d374f8528d6e72f95dbf787e736", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -47076,6 +55360,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -47127,6 +55414,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -47164,7 +55452,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, 
gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221408, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "c1df1ee3e72a70b18cea186c7280db2b9f8f6227465919dd4878a7780ba2b15a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221408, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "27ccd6725ca48d1371eb9c644547fd7867a1b4222827f0a5b305dbc18f80a1b3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -47186,6 +55474,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -47237,6 +55528,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput 
*/ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -47274,7 +55566,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221648, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "1ee92a002680f4e394d5d47fd9c98b17359afc641dc4e928db83ce46eeb50e18", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221648, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "da65a901cc1f66d38bdc142b2182ba364669b8a2f986c63ee7886b59a87d748b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -47296,6 +55588,9 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -47347,6 +55642,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -47384,7 +55680,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221408, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "df418684aeaed5424aaa7e00be7d9314046ac93978969f469e02b5ab3b4be02f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221408, 
"bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "7266ee335763dca433893ba379b03ca24356b73fd3d5adc459b5b97bee6aaf69", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -47406,6 +55702,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -47457,6 +55756,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -47494,7 +55794,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len, 163232, "bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f", 512, "37c01e5b0de6a25a3e8b967d31b8b473abc4d38fd516df7ef9bc2c98359a7c34", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len, 163232, "bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f", 512, "74048d78f165c4718ab87b7d5eaa15fd2bf748c896efb00fe7444dc2a9544641", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -47516,6 +55816,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -47567,6 +55870,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -47604,7 +55908,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len, 163232, "bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f", 512, 
"7e766a1b960c9954298a62b543c4772cbbce450c6869c3e1251a90800a107a42", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len, 163232, "bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f", 512, "40455ada6cdd666dcb565e1b8b343d398a2552b5b7e352340fee622f94acb9b7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -47626,6 +55930,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -47677,6 +55984,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -47714,7 +56022,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len, 77600, 
"bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f", 384, "cfdcd9f9efa9bc71c623d6a90af473bf2bf4b7e292f66d88f255ac1530bc1e8b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len, 77600, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f", 384, "1d1de948d9f3c2ce81c3aa82c51adc504fe005051d772daa78f645312a015a21", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -47736,6 +56044,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -47787,6 +56098,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -47824,7 +56136,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin, 
Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len, 77600, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f", 384, "294765edcf388260cb6dfc0daba7346a01a94eea91dc009bb99dfc23aef04ffc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len, 77600, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f", 384, "564d334dd5f5e2a08aedc01517266b027757e3f75034f281e0be279594fc2230", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -47846,6 +56158,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -47897,6 +56212,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -47934,7 +56250,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f", 256, "3321838e40ff160f9418f33b7770187e9176f0f26fa54c8406498eadfb5932b3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f", 256, "cc935a19a3175c2ce7532824e99c5046c7e5bf911d0d34421d3fc55455c2e78b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -47956,6 +56272,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -48007,6 +56326,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -48044,7 +56364,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 
1 }, gemm::SmVersion::Sm100f}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f", 256, "45d2b9c2d0897ea2533e861a8cb5f05476e57bfecd84380bd9e1e0c3a9f2f31f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f", 256, "bb71c40f99f8397a4e11e563ee20fdef16abf4d5654ffe8e9148940c70f8d8b0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -48066,6 +56386,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -48117,6 +56440,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -48154,7 +56478,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* 
mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f", 256, "4ae229fc250b0b3004c13124172ffb6d395b2ac5eeca8df4f557655d5c2781b2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f", 256, "ba3b54ff4b255d4f18a0f24786e072ac49d268878539f7b05ba58e0b6831aafe", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -48176,6 +56500,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -48227,6 +56554,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -48264,7 +56592,7 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f", 256, "e39b66baca6e3e685fbbc4082d610427f5e99d14769ecf75f94c6cbc8a3295c2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f", 256, "a7b683d3578de3e088a52661c12bf0432c58ba73ae61a760d417e327de9df4ea", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -48286,6 +56614,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -48337,6 +56668,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* 
mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -48374,7 +56706,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 191304, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 896, "24dc8f20fdda3f9405c61f05fd5e361cce6ad9dc4ff67fcc4957e9c3798e222c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 191304, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 768, "8fb34cc699c2afd026b97b99b7c98069f6103dba1046cdb9efc85e85ee90d185", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -48396,6 +56728,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* 
mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -48447,6 +56782,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -48484,7 +56820,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 191304, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 896, "9cc8cfaf654f07c100b1114379471bf4b1e3d4aba7a22b9e245b41170a5d9066", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 191304, 
"bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 768, "6cf59d2ffad1abac9ea0207789377c47d68b15e3866329b3982752ae4ea3e5db", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -48506,6 +56842,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -48557,6 +56896,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -48594,7 +56934,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 215640, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 896, "344313c5bf095b97c20e12694ffbbb5df9781951e4b77f93e17d94739a9b12a8", 
"", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 215640, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 768, "4c41fae50baf76c1e591ea4e64dec9850d5cc19fb2cc56e48788c6e4bab2e785", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -48616,6 +56956,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -48667,6 +57010,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -48704,7 +57048,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 215640, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 896, "746bcc9c8c8788c918497857e7c81f6241e156b0c56a88bd67791c097f4b3179", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 215640, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 768, "8137a224a406ab71faea54302da52fcd5062e1d2d5b8c5f1be329c98ec120c4f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -48726,6 +57070,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -48777,6 +57124,7 @@ 
static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -48814,7 +57162,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 197360, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "f8de453d3db18b8ba1aa00aba60cefb2830f635fbee0bc48d1d8e4951e93f8e5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 197360, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "669d1f3e85ed7e382ec8ce85c02f89a4b427392dee8b3e379bb9620830c1305f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 
1 @@ -48836,6 +57184,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -48887,6 +57238,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -48924,7 +57276,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 197120, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "eb6e6676e0ac0cae7650e24b6772b1245c1da7acd8f93e277fbbb200a043cdf0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 197120, 
"bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "6ba88963ffd4b393c04fe4c37cebad82d2d03ea735385469133111ce2753505a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -48946,6 +57298,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -48997,6 +57352,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -49034,7 +57390,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 223064, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "e9ac99c08d21259fc96c003dbece1d517120371653d4ccaaeb2948b32f4a58e1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ 
gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 223064, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "8f97f50c18713800744704fcfa38d723421ef4c9d69ee69c9c560d46868f6349", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -49056,6 +57412,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -49107,6 +57466,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -49144,7 +57504,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 222824, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "666201c688d1a3b7d3f599a511dbf13e6f34480372780e4f6f9c857a365103c0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 222824, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "8526c51cd4d393bc4f0f04cf12ff21f9a87354a5af2687d9d54585f18134402b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -49166,6 +57526,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -49217,6 +57580,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -49254,7 
+57618,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 197360, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "2e04bd5c26ce05537c97b7e0c0362c83da2704220612ea31f04dc3a38e222bc3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 197360, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "15fcbc55273f88c9fe2109ddb25dc609a755910294bd51cfc297d0d684613774", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -49276,6 +57640,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* 
mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -49327,6 +57694,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -49364,7 +57732,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 197120, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "8dbe2863e08fb5c7eff12288ecad419ee4b391e42ae3c3e6eb7a58a95534a413", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 197120, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "a8b87f00a9a88cd55f84bb947670344dca20bea4291b6eebfbe5a574f4b96c89", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ 
gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -49386,6 +57754,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -49437,6 +57808,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -49474,7 +57846,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 223064, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "4d8a1f1e2a120749dbc883bd8240b5bc187d16dd6a75cccf274ab23edec83d8c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 223064, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "7589183f08285e6d80b242657a5fb2ca6adda34b54f8f086fe57375875d8d2a5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -49496,6 +57868,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -49547,6 +57922,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -49584,7 +57960,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 222824, 
"bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "257f386e11faa678a59c37642951ca6d7774f9ef2f794b4110be35a6abec186e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 222824, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "7a0b2ba2c717de65e9360ab27d191b154a4cd6cc9917f121ea89bf982acd4cf8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -49606,6 +57982,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -49657,6 +58036,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -49694,7 +58074,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 222952, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 640, "e57dc702ccb6087a51d55c71850762515d2b044693171c0416c984931c4ba4d4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 222952, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 640, "c185c35b7377b0401896a87dfa4ce944f0a0b7440fd278bbf8dd16c6dcc12b58", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -49716,6 +58096,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 1 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -49767,6 
+58150,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 1 , /* mUsePerTokenSfA */ 0 @@ -49804,7 +58188,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 222712, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 640, "2eca134f8c7e145abbe6325e6b3056d5823d76cdfd7f88942ac35dff4669778f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 222712, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 640, "48089a53fffd0f685c867381d5e2551fa848016cc00384be1df66ec173e2ed6f", "", nullptr, nullptr, nullptr, 0, { 
/* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -49826,6 +58210,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 1 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -49877,6 +58264,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 1 , /* mUsePerTokenSfA */ 0 @@ -49914,7 +58302,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220912, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "f5c8c88fcb4fc0ac488056767f95904425a99e9f98fbd8d66ccb27fb07b98e68", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220912, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "2091c5e04fc62bbaa3358ef69f2a084c90fa756fec2ff67b46399b4df8102d28", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -49936,6 +58324,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -49987,6 +58378,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -50024,7 +58416,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220672, 
"bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "7e6c3cb848ff20ecf6d7303c83f9937428f687c5c0643aaf608d6633f7b92d6f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220672, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "c1feaf2571010fb9f2be7f979d37ca59abb1be88c2fda0fe27c304cc7cacf13e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -50046,6 +58438,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -50097,6 +58492,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -50134,7 +58530,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, 
-{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 200440, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "5bd2ecbbd7d798cbda041d557cdb9549c235450cae9bdd96585f4b978958a2cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 200440, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "2960a42c78a8167e09188e9080c6b7647a74ce45a047346c7c16afe5152511ed", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -50156,6 +58552,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -50207,6 +58606,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput 
*/ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -50244,7 +58644,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 200200, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "a723b51c5b52cea003d40b4cc107eea8a181a6119e95e424a984d5f44310abab", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 200200, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "a237fb90a828800a78688e06af857e7a63f0b219b44553a5b4897a48d14d5a29", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -50266,6 +58666,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* 
mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -50317,6 +58720,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -50354,7 +58758,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220912, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "a105e47a5482008800c677870eba80d7c0396a9c1e8a67ff838205b4ff0b39c7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220912, 
"bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "96c2880b3cf546ec1b50723208dad43d8868a53190ecaf064ca12eb9358bd263", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -50376,6 +58780,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -50427,6 +58834,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -50464,7 +58872,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220672, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "4a720b3fc4cf9a997c1b7b3b8b79a8c11744db2571bca9fe73588d09af01ab95", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ 
gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220672, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "12fac2e41ca2f4df89b67e072abbad569929f9d5abb65bfa870b52577f50be15", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -50486,6 +58894,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -50537,6 +58948,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -50574,7 +58986,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 200440, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "bd59464adebf444e96ea92ee669cddf27c03676a830e8de93875bbf2908ca417", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 200440, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "5e85308851dc873ef44af28b6d5555b61b91a562f1ab3af8d97b733fb2878f56", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -50596,6 +59008,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -50647,6 +59062,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , 
/* mUsePerTokenSfA */ 0 @@ -50684,7 +59100,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 200200, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "f13fcf00d270f4b5a7e3b5944638295f6775effbc68968ce3456afa9c7d646b8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 200200, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "5434d7dd06013c88e5cfbe01019f4a3da1a9397ed3260f4aafc9bde2a497fec0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -50706,6 +59122,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* 
mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -50757,6 +59176,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -50794,7 +59214,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len, 163768, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f", 512, "54ada705c8fd493a56dda3e476f9b172d9a3b86e9537c3d0241f0791aaa5f4c5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len, 163768, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f", 512, 
"c72283d7f1a49a605f2aeb88e18a1b9e9246ca0d348feb64bca71c3692a5f292", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -50816,6 +59236,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -50867,6 +59290,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -50904,7 +59328,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len, 163768, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f", 512, "cd1f8c02ecbdcc189f78c2f0af276779acad979cf28d939ce47997f8b389c4a6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len, 163768, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f", 512, "4fc79fe4b5934b77b3b81db60bfccfef2a0d8df3260f6882d165c9e5438bc8e6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -50926,6 +59350,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -50977,6 +59404,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -51014,7 +59442,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin, 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len, 183960, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f", 512, "1f43ba0c6128a56554abeef0dd0a4c2d949911d70bf4d7ff7215ddcc3e409f6a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len, 183960, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f", 512, "988189d954260b7ed37cec117494b985e1b1b606ad15e524cabbf8f8537c9148", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -51036,6 +59464,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -51087,6 +59518,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* 
mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -51124,7 +59556,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len, 183960, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f", 512, "e0a5926416bb9cf0f0e7eb592409673173d4c72244e96aed0359b8e5fa67cc60", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len, 183960, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f", 512, "13d0ef3cb4a531d55ca90221b49d41b9b73cc2f7aace5a45d0fd69e0468d9a8d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -51146,6 +59578,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* 
mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -51197,6 +59632,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -51234,7 +59670,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 222032, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "f768813198a55a00d8ed3a12470de2160cfb4e52b4c7e833366d4ce56c69e25d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 222032, 
"bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "2dbfaddede04e7c9e956c809e153ebfb26d666701cbe695898685a1e0122ec1c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -51256,6 +59692,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -51307,6 +59746,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -51344,7 +59784,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "e020e38d3944425ca1cb0fb9a0242969206b091ca967b71e5f66e33a92917bed", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "ee3e08eba2e3fb93f727394846c6be6b5f0aa81372ab08d6685d0d2df75fd1a8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -51366,6 +59806,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -51417,6 +59860,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -51454,7 +59898,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 229208, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "4d4bb9c69b05ee457f986d91c36957730faea961a6f1bba62e52befe30c0ecaf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 229208, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "fc994ac086a8ebeaecffd675ea75bb069943cb3e120417727d4b53f822813f52", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -51476,6 +59920,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -51527,6 +59974,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* 
mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -51564,7 +60012,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220776, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "cc8a290a2ce0090ad0d4e6a8847f9b40fe3a1e03813aaa6f98e4d365bc540939", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220776, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "82229539dd406d8f7b479a785cfc6ba1d47dff7959bf96eeb8438db74430c8f5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -51586,6 +60034,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* 
mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -51637,6 +60088,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -51674,7 +60126,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 222032, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "5af12edc89f39febd2eb67633b6ad47763807f5a3e919bd7a0ae38591323cd0c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 222032, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "9b183337c866d7fa15d7affe110003e7459f7f18b46bed933460c577a2f657e4", "", nullptr, 
nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -51696,6 +60148,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -51747,6 +60202,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -51784,7 +60240,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "e51f7d406cbd70ce63028d2d6c136c5a6cd03e208ca0c95981270c65d53f963b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "5656b81d7da500b4ef2b812cb7cabe3da8767547fdf3f4d422e26bc3810ec7c9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -51806,6 +60262,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -51857,6 +60316,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -51894,7 +60354,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 229208, 
"bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "30a6cc69501ff4824b3d668ce3ac40de124aeafcaa6bd1fb1c4f85c98e2ff9ee", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 229208, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "e2b0fac3661c66711a9d434119d500a2c66cbaa624f8711b9267067f5e8ab52f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -51916,6 +60376,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -51967,6 +60430,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -52004,7 +60468,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* 
mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220776, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "b5a49cc431ab557db3d83eeaad12af64295dce6e2e1ae2215b61ddca9303296e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220776, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "116610b681524294c19eea59d78d7bc97f623fbcf2c6c025f633fbd319d86553", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -52026,6 +60490,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -52077,6 +60544,7 @@ static const batchedGemm::BatchedGemmConfig 
tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -52114,7 +60582,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221744, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "b441cbe049fb2f55609985dbde2057a19ebbb54a542e49fd763c0ab1fb689359", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221744, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "cb0327a0756b927e9dcd1f2cb0c5a2b4904a6375faaeff648bf4f925849389c7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -52136,6 +60604,9 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -52187,6 +60658,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -52224,7 +60696,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221504, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "42aa3877378223c0c9ed379e34d1629e93bdbcff6370513a2f34680e90b7785c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221504, 
"bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "89e1d2b66e2c728239c5385d978b5a3caa04b64991f10be85db18accceaba6f6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -52246,6 +60718,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -52297,6 +60772,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -52334,7 +60810,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 228920, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "9d5674c9bc313f19f3b27ba56a6da9a32cff34ca8f859572a31ca72ac7cf3206", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo 
*/ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 228920, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "f8a905b2f5e374a3491a6ac780ec683c5451c8f68c0ec1c6012f287f72a8784f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -52356,6 +60832,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -52407,6 +60886,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -52444,7 +60924,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220488, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "fcc43a512dcfe9ed47de1bc99287874b8131197432a6c46394e8180f5b582c7c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220488, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "48a32f670dfaa8fec069aeda46b3fe796dca58bf75440ca6ade29642a60c7aeb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -52466,6 +60946,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -52517,6 +61000,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* 
mUsePerTokenSfA */ 0 @@ -52554,7 +61038,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221744, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "7153955fcbb178d625aa57d993e60c5c4855d4de99f048e1037b3a15dbdcc13b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221744, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "e471cb063c4f9212af112970864a4d324ca150bc36c937f3b46b9b9107eea271", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -52576,6 +61060,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* 
mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -52627,6 +61114,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -52664,7 +61152,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221504, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "74c369f693ffa0aabdfa54f1c1a6cdcded710eceacd8a76bcf7e0ac2ad12283d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221504, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "b5cb1bc045ba80c5f5927361cca5935db2f8efd32e89f38612c1cb6175204f39", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , 
/* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -52686,6 +61174,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -52737,6 +61228,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -52774,7 +61266,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 228920, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "4d2f9d65c91c37832dfb7b914ad9ca05f601e5d613bf3f7b900a44c56273d119", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, 
Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 228920, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "a50ea86fb4c6fe1443f9fee3f3cca712e1f4f7666570b13dd887d827433ef8bc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -52796,6 +61288,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -52847,6 +61342,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -52884,7 +61380,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220488, 
"bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "e0f1646866253d44843d167cfb133d3b8f9550953c1f3341a428c7837da9defa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220488, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "63a0ecd05f47672619bf0981d59d599a80c15f2482cfdfc8fbaee56c6b238dbc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -52906,6 +61402,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -52957,6 +61456,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -52996,7 +61496,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { }, gemm::SmVersion::Sm100f}, #endif // EXCLUDE_SM_100F #ifndef EXCLUDE_SM_103 
-{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 116304, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "7bf96e3e93bd2006beec269543860721faa0c224a0e193155950ff88801b8c44", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 116304, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "6358abcdfed4f5155b6d1711ab34ed1a3112aaee0bd7c68d614ff67d5ae89c86", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -53018,6 +61518,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -53069,6 +61572,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule 
*/ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -53106,7 +61610,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "48b6c68052d1f51b115a403401a80593480e4d3ab6b9f6989b6fd6493482c312", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "cf46a5e62e6998a15affc809152dd1169d9c107fce8b9f908f9b28e8a28cf6d0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -53128,6 +61632,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -53179,6 +61686,7 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -53216,7 +61724,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 116304, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "ccc02ddfccd84f6713cf001ceb5beb6936a6815033a6142731da158a0d4f6a74", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 116304, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "044bf86f52e4fdcf640d28350948a5dde7c93f255ecc41bba169c2b9952d2d1e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -53238,6 +61746,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* 
mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -53289,6 +61800,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -53326,7 +61838,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "0fdd3d6345b5da1794a8a6d8b2617bc2ff48127fcf112d9f82f77b5ed2510e56", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "ca6b2e8105ce8f3a2c411bcff6043745fe454a71cc87d0c7d1132d22851dfa2d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo 
*/ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -53348,6 +61860,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -53399,6 +61914,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -53436,7 +61952,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 140880, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "bcce3decbcd6b1b9c69c9273e8763d6388ed4cba02f92be5c05c3d2f9f877a02", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 140880, 
"bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "4823f7b6f5aab0e8c92dd89cd4b9f68e5b42f7c2a96076ffa316b136e00fab87", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -53458,6 +61974,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -53509,6 +62028,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -53546,7 +62066,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "3fc5c2a980a5885e94376b55d2d6fe13655491dbd5d8569486be27b38073b868", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "ccc6fa6ac6b4f70fa301bbcc91ade6525e497396df9f6374c679751e468aa0b2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -53568,6 +62088,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -53619,6 +62142,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -53656,7 +62180,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 140880, 
"bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "0989f429f113bb2e5b7cc12a76c9097650fd32e074f1c75a5c1a2fc771c66446", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 140880, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "557f9b3dc6a951a5163c1cd03ba4a81ed4a409f68f5e8b10923c36edc3b8296c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -53678,6 +62202,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -53729,6 +62256,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -53766,7 +62294,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, 
-{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "f076a2a745bcc3762baad9c3480bfa4741002d161ff9c3e131e0c4d630d0ce89", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "a42db182d845e4c6a75ddb6579a6a1735e52448763f6ca36589743c1a897f488", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -53788,6 +62316,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -53839,6 +62370,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* 
mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -53876,7 +62408,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 157200, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "8e42e235febb0e85d8080d7ca5bf383ca8d1b76843632e93c5e52802713a4170", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 157200, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "f851c2fd6fdd20af6228028cf440f5313d6349b944a6d054ddb25a750d5b4d39", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -53898,6 +62430,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -53949,6 
+62484,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -53986,7 +62522,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "bf80f622a05c39dc49c10d8ba559c92e8909c572200c431c1bf6aa6a09e11a6e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "cb9c4ed3d1b4cbeb6c7120662543cf3bc201030c527c4f3ed77d5bdfbf15f3f1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -54008,6 +62544,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* 
mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -54059,6 +62598,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -54096,7 +62636,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 157200, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "a23bedd7e910be05bae205bea4046aca5cf74ffecaeec5f846ffd0dc880005cc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 157200, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "4c3879ab73444eea663658ce826093267a4b8bd0acdb67bd9b35677e98a9afcb", "", nullptr, nullptr, nullptr, 0, { /* 
mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -54118,6 +62658,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -54169,6 +62712,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -54206,7 +62750,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "0e4fe19af0178c0a675fbd88ebe90128f5e02ccde10cdcda72a45a767f549742", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 156960, 
"bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "421e2a85f1740320a3ce25452b819506230962e20de5f1cbc4bd34c14010a39f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -54228,6 +62772,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -54279,6 +62826,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -54316,7 +62864,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 104016, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "6fa1555201efab77a26be48589cbf70824b3722837166f0cbf2f7617b1b64166", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 104016, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "1d668e6c939995681a444b7a3f6cc916eb047b2f153d148eaed705a845aad448", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -54338,6 +62886,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -54389,6 +62940,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -54426,7 +62978,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 103776, 
"bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "daba14d07fb7c7a6d2284e4bd32596e84d387be3eb98ea456f6faa7a555bef05", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "bcb163f14dd6e1a16814d47ac73de162f63768402058d10cd0d74eaaf11fd7e2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -54448,6 +63000,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -54499,6 +63054,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -54536,7 +63092,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, 
-{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 104016, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "861eba1756780b8bb45a7872087295ab5b0611474e150300be2eed09dc51b5c5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 104016, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "e04a16402becd94bb3e2b058027516064400dbe453475ecb93e4d4d6b5433ba0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -54558,6 +63114,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -54609,6 +63168,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , 
/* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -54646,7 +63206,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "ad62e4f8bfa2bc9b90c839551a269cfff5f6e64a16d33dad780cf5db9ce86bb6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "09f2bd280997995ea84a34c058730515078c2e3bd0c94b655fa8921d1fd9734c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -54668,6 +63228,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -54719,6 +63282,7 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -54756,7 +63320,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 123344, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "5d7b6a642090fcab9b843752c687ae54203c5c0299997419abaa3d6995b72e12", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 123344, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "04296d4c571d32fdc83bbb7c83bf091e95d5a4042a8c8ce69691622c5cab60a7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -54778,6 +63342,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* 
mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -54829,6 +63396,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -54866,7 +63434,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "30ddc0e963e6c3ea6a961f2a28c0779b1aa5e18f86901015d68044ffba8e7b97", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "1322d65b500daf02b7a279d182ab7947a86f778c36cfbdb6c30178dd1cace63d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ 
gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -54888,6 +63456,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -54939,6 +63510,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -54976,7 +63548,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 123344, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "7897a1aaa7a9149bd6965349700dee6f4b3d7eee0daa700ce00691a59ce5beb0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 123344, 
"bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "43413bc1cce52825388ef672b9330cf40542d118f4a0a535c0caf0e0e2e78e4e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -54998,6 +63570,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -55049,6 +63624,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -55086,7 +63662,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "064e961e8553f3b1af22f15b1cbfebbc0acb9b42eca1178ca6f59b5f1d447bc5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "3bf2eb544813ee36ad8b7e851e7b4bf464a9e59baa4f63525d3248c78cb598d1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -55108,6 +63684,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -55159,6 +63738,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -55196,7 +63776,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 114256, 
"bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "186ecbfbc612e8fae7c4834342ee37c849bef9113743831005f54554ef33c1e7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 114256, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "200b204bb284ef1540fb8f051941e9ae5aa53e39e16053f2fa2b4a7051affdaf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -55218,6 +63798,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -55269,6 +63852,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -55306,7 +63890,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, 
-{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "c4624011ee18e5c2080b53d11e3b656bd375ffb0bc18bfe4cb47086bc1a5b10e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "0ad4bd0be61ef93ad558182713c991bd101dc150a674da53f8296a3cfb610599", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -55328,6 +63912,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -55379,6 +63966,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 
0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -55416,7 +64004,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 114256, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "4e156047a3f86312e09947943720da9334d66a34e3a81e62c1653e10017c65f1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 114256, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "088b9c793924a53de401cbc00459c9fc8e8a0329295b7003d38da7cfbc642d0f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -55438,6 +64026,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* 
mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -55489,6 +64080,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -55526,7 +64118,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "f394ea8f3c6cb11d4711669f85260a8b135518cf51ea13b3b8b555fdcf291fa4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, 
"e93c0577c9a4ecb616268b736df1c68e7e4c3be2d4d3da45b9920a0816971a29", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -55548,6 +64140,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -55599,6 +64194,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -55636,7 +64232,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 136784, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "ce70042d1fda34cf57de3bd6653ad677f6922c77a5e9b68ec00d9eea01c600be", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 136784, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "9432442220a030ba762d3945129d42f8a57ade7a3ea9e4b933d1e7598a823aee", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -55658,6 +64254,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -55709,6 +64308,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -55746,7 +64346,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "be921d5a381b6d7a5b1347bf647c6c6cdd6d5a8539aa23752c7a01093d467144", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "0fe5f85f5a2097f849d571e606fa4ac1d59b7df15f29ebdc70ed70954c22732b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -55768,6 +64368,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -55819,6 +64422,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -55856,7 +64460,7 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 136784, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "0fe5ed379374148e39c3ded179490e9cdebfd0ede5d5d3212c3a5ee790c632d2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 136784, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "e69485d9e6cdef4235601e7477113fce9ecfa7833738cceb8e6f386728cef56b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -55878,6 +64482,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* 
mGridTriggerSecondaryB */ 1 @@ -55929,6 +64536,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -55966,7 +64574,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "86d7bb7a2e38c5700ad2fcbb2b6103233055413de0447bf1a14fda3b7dae1ab7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "e87fe52cda1348701af57103ccb63395f2dc5d46b4a2cc1de8502f753dd4ad04", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain 
*/ 1 @@ -55988,6 +64596,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -56039,6 +64650,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -56076,7 +64688,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 149008, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "848df26c6e4c9850f6c16cd7cc4fe076b350dad1acc5e267a5db2541e66dfb35", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 149008, 
"bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "f26a308ef60f7b13c482b63310fcdd928cd599c60dafb6b0e8fd552f6413226b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -56098,6 +64710,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -56149,6 +64764,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -56186,7 +64802,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "41e53a515977996b9593db892d5a8b52d6bd0caa636dae27cd1cbb3e707cae1b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "049ae2ba3c8b24ec45b5cbe0e8d843cdd0672fddde05acf408c44a34c098643b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -56208,6 +64824,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -56259,6 +64878,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -56296,7 +64916,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 149008, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "527a5084ec9a965bfa05ffbc4095520baece8612266a51b98e413ea9097685be", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 149008, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "de9fc69b25e030dd0a816d5285936ea4b0b92c05fe1b981df2096439f38318e8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -56318,6 +64938,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -56369,6 +64992,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 
0 @@ -56406,7 +65030,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "7efe8733abb57b589d188dfd7bc05e05518b8c45dbebb9767e9ad1921c033223", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "a1ffb3f005555e72a8bb6ec16af258aa7c91eb39634e8ffce556c45538271876", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -56428,6 +65052,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , 
/* mGridTriggerSecondaryB */ 1 @@ -56479,6 +65106,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -56516,7 +65144,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 102992, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "6105ff9eb131e04861aae6d735592a54fb37415a8497b5fa5b903ff1286c91d4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 102992, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "2ec67b7ab25da1128b82508f8fc77213d575a08cbc6f74f0dfd9999b84e9f0ef", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* 
mClcFastDrain */ 1 @@ -56538,6 +65166,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -56589,6 +65220,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -56626,7 +65258,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "efabcf160754497ad550d50f6ea252cff44302d5f266876ccfa50d6ac3ed2d48", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 102752, 
"bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "9b196a2d829cfbe73554028d19d5119c313797543109ce4615b32f760488fe59", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -56648,6 +65280,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -56699,6 +65334,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -56736,7 +65372,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 102992, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "2c3905243260b59a6c43ac50ea18ccb69c27347368bd49aad3d965323ce9dc9a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 102992, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "fbccd1747a3a8f89adc54a946da3e45479204ad6d45d9c486911f65f23155a83", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -56758,6 +65394,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -56809,6 +65448,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -56846,7 +65486,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, 
Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "722a40cf50181813441a446227612607c8bb17e6c70d1b654eb85900736aa721", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "6fd7dafd18db5800efce218fd91638b1df54127f2dd10534fffe9ce7a197a8f4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -56868,6 +65508,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -56919,6 +65562,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -56956,7 +65600,7 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 122320, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "c4a9c09b6b2369ce7b42a3bc1a8204c5d0544df65469cf4e2298f105ed832498", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 122320, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "af8e04f0129a02665a4f584e382c6792c7229bb5210f5e9c04a88f73c7adb8b0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -56978,6 +65622,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ 
-57029,6 +65676,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -57066,7 +65714,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "2b55a2f62ca0f6e4a6e4f6f32961bd260176c6edb0b8b0d845a6f9a27c4780e0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "3b2d474b0ae75f56d9fa1ccdbff7f40fd3b900907c0b5e7badf1dcf23334b96e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -57088,6 +65736,9 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -57139,6 +65790,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -57176,7 +65828,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 122320, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "98e5f4c53c78c441c753180f023a664b775f948a64dbecd9983342e73a4dab11", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 122320, 
"bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "7ca885f110e2cd576cf46a3f8bcfa0c6430d708247df1002e51900ae442b8439", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -57198,6 +65850,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -57249,6 +65904,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -57286,7 +65942,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "5fd73c99b3c18a78fbddcb3322ae2550cb5643199c4503d86d35c25d66f20de7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) 
+{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "3de99c245f139ad3299e46db54eabc11dba14504b97b99398b84c029a56b360b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -57308,6 +65964,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -57359,6 +66018,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h index c4c3d9587d4..5b6810938fe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h @@ -193,7 +193,7 @@ static auto makeTmaShapeStrideAbc(GemmOptions const& options, int sizeM, int siz if (matrixType != 
MatrixType::MatrixC) { // When using 2CTA MMA, we only need to load half of the tile in each CTA for B. - if (matrixType == MatrixType::MatrixB && tileShape[1] > 1 && options.mClusterDimX == 2) + if (matrixType == MatrixType::MatrixB && tileShape[1] > 1 && options.mClusterDimX >= 2) { tileShape[1] /= 2; } @@ -226,7 +226,7 @@ static auto makeTmaShapeStrideAbc(GemmOptions const& options, int sizeM, int siz // Create the TMA shape/stride for A/B block scaling factors. static auto makeTmaShapeStrideSfAb(int mM, int mN, int mK, MatrixType matrixType, int tileM, int tileN, int tileK, - tg::SfLayout layout, int sfReshapeFactor, const int32_t numEltsPerSf) + tg::SfLayout layout, int sfReshapeFactor, int32_t const numEltsPerSf) { // The outer dimension. @@ -524,7 +524,7 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc // Build TMA descriptor for gmem A block scaling factors. auto [shapeSfA, strideSfA, tileShapesSfA] = makeTmaShapeStrideSfAb(options.mM * options.mNumBatches, options.mN, options.mK, MatrixType::MatrixA, options.mTileM, options.mTileN, options.mTileK, - tg::SfLayout::R128c4, options.mSfReshapeFactor, numEltsPerSfA); + options.mSfLayoutA, options.mSfReshapeFactor, numEltsPerSfA); params.tmaSfA[0] = gemm::buildSfTmaDescriptor(dTypeSfA, shapeSfA, strideSfA, tileShapesSfA, const_cast(dSfA)); } @@ -646,7 +646,30 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc tg::Dtype const dTypeSf = (options.mDtypeA == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; int32_t const numEltsPerSfA = options.mSfBlockSizeA; - if (options.mRouteSfsImpl.value() == batchedGemm::RouteImpl::NoRoute) + if (batchedGemm::doesRouteImplUseTma(options.mRouteSfsImpl.value())) + { + + // The input is NOT padded: + // [act0, act1, act2, ...] + + // Build TMA descriptor for gmem A block scaling factors. + // Pad number of scaling factors to the nearest multiple of 16 because of the TMA 16B + // alignment requirement. 
+ auto numSfsInK = options.mK / numEltsPerSfA; + numSfsInK = ceilDiv(numSfsInK, 16) * 16; + + auto numSfsInValidK = options.mValidK / numEltsPerSfA; + numSfsInValidK = ceilDiv(numSfsInValidK, 16) * 16; + + auto [shapeSfA, strideSfA, tileShapesSfA] = makeTmaShapeStrideAbc(options, options.mNumTokens, + options.mN, numSfsInK, 1 /* tileM */, options.mTileN, options.mTileK / numEltsPerSfA, + MatrixType::MatrixA, options.mNumTokens, options.mValidN, numSfsInValidK); + params.tmaSfA[0] + = gemm::buildNdTmaDescriptor(dTypeSf, shapeSfA, strideSfA, tileShapesSfA, const_cast(dSfA), + /*doPad=*/false, + /*doSwizzle=*/true); + } + else if (options.mRouteSfsImpl.value() == batchedGemm::RouteImpl::NoRoute) { // The input is padded: @@ -655,8 +678,8 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc // Build TMA descriptor for gmem A block scaling factors. auto [shapeSfA, strideSfA, tileShapesSfA] = makeTmaShapeStrideSfAb(inputNumTokensSfA, options.mN, - options.mK, MatrixType::MatrixA, options.mTileM, options.mTileN, options.mTileK, - tg::SfLayout::R128c4, options.mSfReshapeFactor, numEltsPerSfA); + options.mK, MatrixType::MatrixA, options.mTileM, options.mTileN, options.mTileK, options.mSfLayoutA, + options.mSfReshapeFactor, numEltsPerSfA); params.tmaSfA[0] = gemm::buildSfTmaDescriptor(dTypeSf, shapeSfA, strideSfA, tileShapesSfA, const_cast(dSfA)); } diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParamsDecl.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParamsDecl.h index 36c7e819817..c0d9ee1dbb8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParamsDecl.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParamsDecl.h @@ -230,7 +230,7 @@ struct KernelParams // The pre-activation scaling factor (typically dequantA * dequantB) for non-gated non-linear // activation. 
- // Only used when non-linear activation is applied (e.g., GELU, Relu2). + // Only used when non-linear activation is applied (e.g., GELU, Relu2, Silu). // When used, scaleC should be quantScaleC only, and this scale is applied before the // activation. Shape is [B]. float const* ptrScaleAct{nullptr}; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h index e73decab006..b18ad67bfbe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h @@ -390,77 +390,86 @@ class KernelTraits } // Per-token Scale Factors - { - // Number of bytes for per-token scale factors - auto const numBytesSmemPerTokenSf - = (usePerTokenSfA ? (tileM) * sizeof(float) : 0) + (usePerTokenSfB ? (tileN) * sizeof(float) : 0); - // Number of bytes alignment for per-token scale factors - auto const numBytesAlignmentPerTokenSf = 16; - // Add info. - smemChunkNames.emplace_back("smemPerTokenSf"); - numBytesAndAlignmentPerSmemChunk.emplace_back( - std::make_pair(numBytesSmemPerTokenSf, numBytesAlignmentPerTokenSf)); - firstChunkReuseSmem.emplace_back(false); - } - - // Bias - { - int32_t numBytesSmemBias = 0; - if (isBiasTypeN(biasType)) - { - numBytesSmemBias = tileN * sizeof(float); - } - else if (isBiasTypeM(biasType)) - { - numBytesSmemBias = tileM * sizeof(float); - } - else if (isBiasTypeMn(biasType)) - { - numBytesSmemBias = tileM * tileN * sizeof(float); - } - // Number of bytes alignment for bias - auto const numBytesAlignmentBias = 16; - // Add info. 
- smemChunkNames.emplace_back("smemBias"); - numBytesAndAlignmentPerSmemChunk.emplace_back(std::make_pair(numBytesSmemBias, numBytesAlignmentBias)); - firstChunkReuseSmem.emplace_back(false); - } + {{// Number of bytes for per-token scale factors + auto const numBytesSmemPerTokenSf = (usePerTokenSfA ? (tileM) * sizeof(float) : 0); + // Number of bytes alignment for per-token scale factors + auto const numBytesAlignmentPerTokenSf = 16; + // Add info. + smemChunkNames.emplace_back("smemPerTokenSfA"); + numBytesAndAlignmentPerSmemChunk.emplace_back( + std::make_pair(numBytesSmemPerTokenSf, numBytesAlignmentPerTokenSf)); + firstChunkReuseSmem.emplace_back(false); + } + { + // Number of bytes for per-token scale factors + auto const numBytesSmemPerTokenSf = (usePerTokenSfB ? (tileN) * sizeof(float) : 0); + // Number of bytes alignment for per-token scale factors + auto const numBytesAlignmentPerTokenSf = 16; + // Add info. + smemChunkNames.emplace_back("smemPerTokenSfB"); + numBytesAndAlignmentPerSmemChunk.emplace_back( + std::make_pair(numBytesSmemPerTokenSf, numBytesAlignmentPerTokenSf)); + firstChunkReuseSmem.emplace_back(false); + } + } - // Per-block absolute maximum for multi-warp reduction. - { - // Number of bytes: number of epilogue warps * number of tile columns. - auto const numBytesSmemBlockAmax = transposeMmaOutput ? 4 * tileN * sizeof(float) : 0; - // Number of bytes alignment. - auto const numBytesAlignmentBlockAmax = 16; - // Add info. 
- smemChunkNames.emplace_back("smemBlockAmax"); - numBytesAndAlignmentPerSmemChunk.emplace_back( - std::make_pair(numBytesSmemBlockAmax, numBytesAlignmentBlockAmax)); - firstChunkReuseSmem.emplace_back(false); - } + // Bias + { + int32_t numBytesSmemBias = 0; + if (isBiasTypeN(biasType)) + { + numBytesSmemBias = tileN * sizeof(float); + } + else if (isBiasTypeM(biasType)) + { + numBytesSmemBias = tileM * sizeof(float); + } + else if (isBiasTypeMn(biasType)) + { + numBytesSmemBias = tileM * tileN * sizeof(float); + } + // Number of bytes alignment for bias + auto const numBytesAlignmentBias = 16; + // Add info. + smemChunkNames.emplace_back("smemBias"); + numBytesAndAlignmentPerSmemChunk.emplace_back(std::make_pair(numBytesSmemBias, numBytesAlignmentBias)); + firstChunkReuseSmem.emplace_back(false); + } - // SmemConstSfBuf - // A buffer used to copy constant values to TMEM. - { - // Do we need the buffer? - bool const useConstSfBuf = dtypeB == tg::Dtype::E4m3 && dtypeMmaB == tg::Dtype::MxE4m3; - // Number of bytes for the buffer. - auto const numSmemBytesConstSfBuf = useConstSfBuf ? 512 : 0; - // Number of bytes for the alignment of the buffer. - auto const numBytesAlignmentConstSfBuf = 16; - // No need to reuse the first chunk. - auto const reuseChunksSmemConstSfBuf = false; + // Per-block absolute maximum for multi-warp reduction. + { + // Number of bytes: number of epilogue warps * number of tile columns. + auto const numBytesSmemBlockAmax = transposeMmaOutput ? 4 * tileN * sizeof(float) : 0; + // Number of bytes alignment. + auto const numBytesAlignmentBlockAmax = 16; + // Add info. + smemChunkNames.emplace_back("smemBlockAmax"); + numBytesAndAlignmentPerSmemChunk.emplace_back( + std::make_pair(numBytesSmemBlockAmax, numBytesAlignmentBlockAmax)); + firstChunkReuseSmem.emplace_back(false); + } - // Add info. 
- smemChunkNames.emplace_back("smemConstSfBuf"); - numBytesAndAlignmentPerSmemChunk.emplace_back( - std::make_pair(numSmemBytesConstSfBuf, numBytesAlignmentConstSfBuf)); - firstChunkReuseSmem.emplace_back(reuseChunksSmemConstSfBuf); - } + // SmemConstSfBuf + // A buffer used to copy constant values to TMEM. + { + // Do we need the buffer? + bool const useConstSfBuf = dtypeB == tg::Dtype::E4m3 && dtypeMmaB == tg::Dtype::MxE4m3; + // Number of bytes for the buffer. + auto const numSmemBytesConstSfBuf = useConstSfBuf ? 512 : 0; + // Number of bytes for the alignment of the buffer. + auto const numBytesAlignmentConstSfBuf = 16; + // No need to reuse the first chunk. + auto const reuseChunksSmemConstSfBuf = false; + + // Add info. + smemChunkNames.emplace_back("smemConstSfBuf"); + numBytesAndAlignmentPerSmemChunk.emplace_back( + std::make_pair(numSmemBytesConstSfBuf, numBytesAlignmentConstSfBuf)); + firstChunkReuseSmem.emplace_back(reuseChunksSmemConstSfBuf); + } - // Create SMEM helper object. - mSmemAllocatorHelper - = MemAllocatorHelper(numBytesAndAlignmentPerSmemChunk, firstChunkReuseSmem, smemChunkNames); + // Create SMEM helper object. + mSmemAllocatorHelper = MemAllocatorHelper(numBytesAndAlignmentPerSmemChunk, firstChunkReuseSmem, smemChunkNames); #if 0 // E.g., // Chunk 0 smemLoadA: 32768 bytes, 1024 alignment, false, offset 0 @@ -470,146 +479,145 @@ class KernelTraits // Chunk 4 smemGmemC1: 65536 bytes, 1024 alignment, false, offset 65536 // Chunk 5 smemRowMax: 512 bytes, 16 alignment, false, offset 131072 // Chunk 6 smemSliceK: 0 bytes, 16 alignment, false, offset 131584 - // Chunk 7 smemPerTokenSf: 0 bytes, 16 alignment, false, offset 131584 + // Chunk 7 smemPerTokenSfA: 0 bytes, 16 alignment, false, offset 131584 + // Chunk 8 smemPerTokenSfB: 0 bytes, 16 alignment, false, offset 131584 mSmemAllocatorHelper.print(); #endif - } - - // - // TMEM - // - // [..D..][..A..][.SfA.][.SfB.] 
- { - std::vector> numBytesAndAlignmentPerTmemChunk; - std::vector firstChunkReuseTmem; - std::vector tmemChunkNames; - // Matrix D - { - // Two set of TMEM resources for D share epilogueTileN columns, - // | set0:epiTileN0 | set0:epiTileN1/set1:epiTileN0 | set1:epiTileN1 | - auto const numCols = mUseMaxTmemOverlap ? 2 * tileN - epilogueTileN : tileN; - // Number of columns for accumulators. - auto const numTmemColsD = numSlicesForSliceK * numCols * numStagesMma * tg::dtypeGetNumBits(dtypeAcc) - / tg::dtypeGetNumBits(tg::Dtype::UInt32); - // Number of columns for D alignment. - auto const numColsAlignmentD = 2; - // No need to reuse TMEM. - auto const reuseChunksTmemD = false; - - // Add info. - tmemChunkNames.emplace_back("tmemD"); - numBytesAndAlignmentPerTmemChunk.emplace_back(std::make_pair(numTmemColsD, numColsAlignmentD)); - firstChunkReuseTmem.emplace_back(reuseChunksTmemD); - } - - // Matrix A - { - // We use TMEM for A if we use slice-K or if we need to cast A. - bool const useTmemA = (numSlicesForSliceK > 1) || (dtypeMmaA != dtypeA); - // Number of columns for A. - auto const numTmemColsA = useTmemA ? numStages * tileK - / (numSlicesForSliceK * tg::dtypeGetNumBits(tg::Dtype::UInt32) / tg::dtypeGetNumBits(dtypeMmaA)) - : 0; - // Number of columns for A alignment. - auto const numColsAlignmentA = 4; - // No need to reuse TMEM. - auto const reuseChunksTmemA = false; - - // Add info. - tmemChunkNames.emplace_back("tmemA"); - numBytesAndAlignmentPerTmemChunk.emplace_back(std::make_pair(numTmemColsA, numColsAlignmentA)); - firstChunkReuseTmem.emplace_back(reuseChunksTmemA); - } - - // Sf A - { - // Does the MMA require block scales in TMEM for A? - bool const useBlockScalingA = tg::dtypeIsBlockFmt(dtypeMmaA); - // Are the block scales constant? - bool const useConstSfA = useBlockScalingA && !tg::dtypeIsBlockFmt(dtypeA); - // TMEM cols group size in the K dimension. - int32_t kGroupSize = 4; - // Number of columns per stage. 
- int32_t const numColsPerStage = useBlockScalingA - ? ((tileK / (kGroupSize * numEltsPerSfA)) * tg::getTmemColStridePerGroup(tileM, mmaK, kGroupSize)) - : 0; - // Number of columns for scaling factors of A. - auto const numTmemColsSfA = useConstSfA ? tg::roundUp(numColsPerStage, 4) - : (numColsPerStage * (mFuseUtccpWithUtcmma ? 1 : numStages)); - // Number of columns for Sf alignment. - auto const numColsAlignmentSfA = 4; - // No need to reuse TMEM. - auto const reuseChunksTmemSfA = false; - - // Add info. - tmemChunkNames.emplace_back("tmemSfA"); - numBytesAndAlignmentPerTmemChunk.emplace_back(std::make_pair(numTmemColsSfA, numColsAlignmentSfA)); - firstChunkReuseTmem.emplace_back(reuseChunksTmemSfA); - } +} - // Sf B - { - // Does the MMA require block scales in TMEM for B? - bool const useBlockScalingB = tg::dtypeIsBlockFmt(dtypeMmaB); - // Are the block scales constant? - bool const useConstSfB = useBlockScalingB && !tg::dtypeIsBlockFmt(dtypeB); - // TMEM cols group size in the K dimension. - int32_t kGroupSize = 4; - // Number of columns per stage. - int32_t const numColsPerStage = useBlockScalingB - ? ((tileK / (kGroupSize * numEltsPerSfB)) * tg::getTmemColStridePerGroup(tileN, mmaK, kGroupSize)) - : 0; - // Number of columns for scaling factors of B. - auto const numTmemColsSfB = useConstSfB ? tg::roundUp(numColsPerStage, 4) - : (numColsPerStage * (mFuseUtccpWithUtcmma ? 1 : numStages)); - // Number of columns for Sf alignment. - auto const numColsAlignmentSfB = 4; - // No need to reuse TMEM. - auto const reuseChunksTmemSfB = false; +// +// TMEM +// +// [..D..][..A..][.SfA.][.SfB.] +{ + std::vector> numBytesAndAlignmentPerTmemChunk; + std::vector firstChunkReuseTmem; + std::vector tmemChunkNames; + // Matrix D + { + // Two set of TMEM resources for D share epilogueTileN columns, + // | set0:epiTileN0 | set0:epiTileN1/set1:epiTileN0 | set1:epiTileN1 | + auto const numCols = mUseMaxTmemOverlap ? 
2 * tileN - epilogueTileN : tileN; + // Number of columns for accumulators. + auto const numTmemColsD = numSlicesForSliceK * numCols * numStagesMma * tg::dtypeGetNumBits(dtypeAcc) + / tg::dtypeGetNumBits(tg::Dtype::UInt32); + // Number of columns for D alignment. + auto const numColsAlignmentD = 2; + // No need to reuse TMEM. + auto const reuseChunksTmemD = false; + + // Add info. + tmemChunkNames.emplace_back("tmemD"); + numBytesAndAlignmentPerTmemChunk.emplace_back(std::make_pair(numTmemColsD, numColsAlignmentD)); + firstChunkReuseTmem.emplace_back(reuseChunksTmemD); + } - // Add info. - tmemChunkNames.emplace_back("tmemSfB"); - numBytesAndAlignmentPerTmemChunk.emplace_back(std::make_pair(numTmemColsSfB, numColsAlignmentSfB)); - firstChunkReuseTmem.emplace_back(reuseChunksTmemSfB); - } + // Matrix A + { + // We use TMEM for A if we use slice-K or if we need to cast A. + bool const useTmemA = (numSlicesForSliceK > 1) || (dtypeMmaA != dtypeA); + // Number of columns for A. + auto const numTmemColsA = useTmemA ? numStages * tileK + / (numSlicesForSliceK * tg::dtypeGetNumBits(tg::Dtype::UInt32) / tg::dtypeGetNumBits(dtypeMmaA)) + : 0; + // Number of columns for A alignment. + auto const numColsAlignmentA = 4; + // No need to reuse TMEM. + auto const reuseChunksTmemA = false; + + // Add info. + tmemChunkNames.emplace_back("tmemA"); + numBytesAndAlignmentPerTmemChunk.emplace_back(std::make_pair(numTmemColsA, numColsAlignmentA)); + firstChunkReuseTmem.emplace_back(reuseChunksTmemA); + } - // Sparsity info for A - { - // Number of columns for the sparsity info for A (note: for Dense, this is 0). - auto const numTmemColsSparsityInfoA - = numStages * tg::getNumBytesSparsityInfo(sparsityA, tileK) / 4 /* bytes */; - // Number of columns for Sf alignment. - auto const numColsAlignmentSparsityInfoA = 2; - // No need to reuse TMEM. - auto const reuseChunksTmemSparsityInfoA = false; + // Sf A + { + // Does the MMA require block scales in TMEM for A? 
+ bool const useBlockScalingA = tg::dtypeIsBlockFmt(dtypeMmaA); + // Are the block scales constant? + bool const useConstSfA = useBlockScalingA && !tg::dtypeIsBlockFmt(dtypeA); + // TMEM cols group size in the K dimension. + int32_t kGroupSize = 4; + // Number of columns per stage. + int32_t const numColsPerStage = useBlockScalingA + ? ((tileK / (kGroupSize * numEltsPerSfA)) * tg::getTmemColStridePerGroup(tileM, mmaK, kGroupSize)) + : 0; + // Number of columns for scaling factors of A. + auto const numTmemColsSfA = useConstSfA ? tg::roundUp(numColsPerStage, 4) + : (numColsPerStage * (mFuseUtccpWithUtcmma ? 1 : numStages)); + // Number of columns for Sf alignment. + auto const numColsAlignmentSfA = 4; + // No need to reuse TMEM. + auto const reuseChunksTmemSfA = false; + + // Add info. + tmemChunkNames.emplace_back("tmemSfA"); + numBytesAndAlignmentPerTmemChunk.emplace_back(std::make_pair(numTmemColsSfA, numColsAlignmentSfA)); + firstChunkReuseTmem.emplace_back(reuseChunksTmemSfA); + } - // Add info. - tmemChunkNames.emplace_back("tmemSparsityInfoA"); - numBytesAndAlignmentPerTmemChunk.emplace_back( - std::make_pair(numTmemColsSparsityInfoA, numColsAlignmentSparsityInfoA)); - firstChunkReuseTmem.emplace_back(reuseChunksTmemSparsityInfoA); - } + // Sf B + { + // Does the MMA require block scales in TMEM for B? + bool const useBlockScalingB = tg::dtypeIsBlockFmt(dtypeMmaB); + // Are the block scales constant? + bool const useConstSfB = useBlockScalingB && !tg::dtypeIsBlockFmt(dtypeB); + // TMEM cols group size in the K dimension. + int32_t kGroupSize = 4; + // Number of columns per stage. + int32_t const numColsPerStage = useBlockScalingB + ? ((tileK / (kGroupSize * numEltsPerSfB)) * tg::getTmemColStridePerGroup(tileN, mmaK, kGroupSize)) + : 0; + // Number of columns for scaling factors of B. + auto const numTmemColsSfB = useConstSfB ? tg::roundUp(numColsPerStage, 4) + : (numColsPerStage * (mFuseUtccpWithUtcmma ? 
1 : numStages)); + // Number of columns for Sf alignment. + auto const numColsAlignmentSfB = 4; + // No need to reuse TMEM. + auto const reuseChunksTmemSfB = false; + + // Add info. + tmemChunkNames.emplace_back("tmemSfB"); + numBytesAndAlignmentPerTmemChunk.emplace_back(std::make_pair(numTmemColsSfB, numColsAlignmentSfB)); + firstChunkReuseTmem.emplace_back(reuseChunksTmemSfB); + } - // Create TMEM helper object. - mTmemAllocatorHelper - = MemAllocatorHelper(numBytesAndAlignmentPerTmemChunk, firstChunkReuseTmem, tmemChunkNames); - } + // Sparsity info for A + { + // Number of columns for the sparsity info for A (note: for Dense, this is 0). + auto const numTmemColsSparsityInfoA = numStages * tg::getNumBytesSparsityInfo(sparsityA, tileK) / 4 /* bytes */; + // Number of columns for Sf alignment. + auto const numColsAlignmentSparsityInfoA = 2; + // No need to reuse TMEM. + auto const reuseChunksTmemSparsityInfoA = false; + + // Add info. + tmemChunkNames.emplace_back("tmemSparsityInfoA"); + numBytesAndAlignmentPerTmemChunk.emplace_back( + std::make_pair(numTmemColsSparsityInfoA, numColsAlignmentSparsityInfoA)); + firstChunkReuseTmem.emplace_back(reuseChunksTmemSparsityInfoA); } + // Create TMEM helper object. + mTmemAllocatorHelper = MemAllocatorHelper(numBytesAndAlignmentPerTmemChunk, firstChunkReuseTmem, tmemChunkNames); +} +} // namespace gemm + public: - // The MMA kind. - tg::MmaKind mMmaKind{}; - // Whether fuse Utccp into the MMA task. - bool mFuseUtccpWithUtcmma{}; - // Whether use the max TMEM overlap trick. - bool mUseMaxTmemOverlap{}; - // The number of epilogue warps. - int32_t mNumEpilogueWarps{}; - // Helper for SMEM allocation. - MemAllocatorHelper mSmemAllocatorHelper; - // Helper for TMEM allocation. - MemAllocatorHelper mTmemAllocatorHelper; -}; +// The MMA kind. +tg::MmaKind mMmaKind{}; +// Whether fuse Utccp into the MMA task. +bool mFuseUtccpWithUtcmma{}; +// Whether use the max TMEM overlap trick. 
+bool mUseMaxTmemOverlap{}; +// The number of epilogue warps. +int32_t mNumEpilogueWarps{}; +// Helper for SMEM allocation. +MemAllocatorHelper mSmemAllocatorHelper; +// Helper for TMEM allocation. +MemAllocatorHelper mTmemAllocatorHelper; +}; // namespace batchedGemm //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -680,9 +688,16 @@ inline int32_t getSmemOffsetSliceK(KernelTraits traits) //////////////////////////////////////////////////////////////////////////////////////////////////// -inline int32_t getSmemOffsetPerTokenSf(KernelTraits traits) +inline int32_t getSmemOffsetPerTokenSfA(KernelTraits traits) +{ + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemPerTokenSfA"); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline int32_t getSmemOffsetPerTokenSfB(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemPerTokenSf"); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemPerTokenSfB"); } //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/TmaDescriptor.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/TmaDescriptor.h index d09ffb7f298..8c1a6347322 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/TmaDescriptor.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/TmaDescriptor.h @@ -19,6 +19,7 @@ #include "trtllm/gen/DtypeDecl.h" #include "trtllm/gen/MmaDecl.h" #include +#include #ifdef TLLM_ENABLE_CUDA #include diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/config.json b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/config.json index 51f7d7895ee..2ad25f095da 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/config.json +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/config.json @@ -205,7 +205,8 @@ "fusedAct,act,eltwiseActType": [ [true, "swiglu", "none"], [true, "geglu", "none"], - [false, "swiglu", "relu2"] + [false, "swiglu", "relu2"], + [false, "swiglu", "silu"] ], "sfLayoutB": "linear", "useUnrollLoop2xForMma": [true, false], @@ -230,7 +231,8 @@ "routeSfsAct": "tma", "fusedAct,act,eltwiseActType": [ [true, "geglu", "none"], - [false, "none", "relu2"] + [false, "none", "relu2"], + [false, "none", "silu"] ], "sfLayoutA": "128x4", "sfLayoutB": "linear", @@ -254,7 +256,8 @@ "fusedAct,act,eltwiseActType": [ [true, "swiglu", "none"], [true, "geglu", "none"], - [false, "none", "relu2"] + [false, "none", "relu2"], + [false, "none", "silu"] ], "sfLayoutB": "linear", "useUnrollLoop2xForMma": [true, false], @@ -275,7 +278,8 @@ "fusedAct,act,eltwiseActType": [ [true, "swiglu", "none"], [true, "geglu", "none"], - [false, "none", "relu2"] + [false, "none", "relu2"], + [false, "none", "silu"] ], "sfLayoutB": "linear", "useUnrollLoop2xForMma": false, @@ -409,7 +413,8 @@ "routeAct": "tma", "fusedAct,eltwiseActType": [ [true, "none"], - [false, "relu2"] + [false, "relu2"], + [false, "silu"] ], "usePerTokenSfB": true, "useUnrollLoop2xForMma": [true, false], @@ -431,7 +436,8 @@ "routeAct": "tma", "fusedAct,eltwiseActType": [ [true, "none"], - [false, "relu2"] + [false, "relu2"], + [false, "silu"] ], "usePerTokenSfB": true, "numRegsPerThreadNonEpilogueWarp": 56, diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d33e0cdffe4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fb7cecb5f5aca5eb6d15c9a38332e1ffd6e20ecade5be5915ba4e9fa4354b7e +size 638826 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0976285a912..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bb1fd74f68a4fcb5d81622999e54c5401510c42c866c8dcd11817b895172dc53 -size 615997 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..8495a6d3fe4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea382dc47c588105057f170e83dbff54e94f056d89083cd49b28bd100f52dec9 +size 655208 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index c9e0cb6d739..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ccf78ca00c241d696da6de81b5432fa512707aaf389ae48e0d042fd1ec482b2d -size 632430 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..124cda881cc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c51fe565dfdb33ed64362558318a9f9a8146c69535f2112dd1e14e95545528be +size 578533 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f8d1126af2b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cfe9effc7a2a218f7c745e6c4d0440776b681ffd44a07548c0d22bfce9102af +size 451241 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4bdeadc5282..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:45a0a537039390c5b789006c26a757de70bbdb5df05e42b799c60e49201da3fc -size 579287 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 44b7fe50050..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eeec6ae6a67c7278dde7dede32b471afc97f4cd5622286009a12e6b472f07793 -size 451995 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0e10d3f129f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ba6731622f1b911ef1c2afa58ffb06eeb823e0178ad8f67bb08c3de542bfb1f +size 594077 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..41d79ec830b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73958a14b75fe6e61f5dbdaa93e1e68daaaf6a8d07cd1b66987daa71e9ed38a7 +size 471175 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index f7930eb0b5c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:915ab441627dc42851d991becf9995a166b9c53c08c5445ab7c3a72fa304a5aa -size 594831 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 1040d856cca..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:41de4188d47a261871b5886acb9f9e9dec3cc37fbbf352d6a5f7a657cd0eb5d4 -size 471929 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f321dd4bf0a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a9569f9852014cd18a9f44b57ac43cb07977fdb0d2623be567f45845276da76 +size 601375 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e7cd2fcfdc6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ad514fd908d4db6c3654e74f438b97d522788dce1216f0d9e13af0c325d372b +size 466879 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index bf59881c1a9..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6098363f9f22fc8ea7771456994fda24aaa0ac16ba53c42755271ea36f03ffdd -size 467683 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7e649bae518 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3349134fa1c86d6d798ef3e3e60fafbba5f79afb67904f7081655652c9c455c +size 629006 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..54f158e3d15 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1554a195a73311ba4df80c2d9cf6bf5f6b13c7873f0fbc5c53d537c531efe2f +size 490811 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0e8ec984d4d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fa9d2ae4389fad4928e80f500de6f126d17bba0509324cb10338283aa628eaf1 -size 624776 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index d7dc73b03b5..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a8a1dd1587a7f9e291bdd2900b1abc1faac73d7b2ec4a740a545f2737524fefc -size 491613 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..fa95a22a560 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca4c12e592653e3d11821e665b0ecd24cdc470a317453677e08e7130ae235c0b +size 655950 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 1e00fa5e1f3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:18d6b5682f282804745e0ab137f3d220d03ea9c54609f605307c40887dfff336 -size 636082 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..21870ff6ffc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42b2b33180920229a9e5d243bf497005a96a3eb7bc0cf1e5bb88279b687b10db +size 585193 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..76261c176a7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6dae14cf5f65b8e8300d99ef6d58150d7376a38c978746d4ffd8c47ad72686a +size 457211 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 1d4046bba1b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ea621f03e28405db0b937e2ca88d8ad0b86b372f7b594fa0a250fd60e1455602 -size 585947 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 662c10402a6..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cfb3b766e3260d8a80db56c9dcc14bc3d04131bbdd457d1458e8121988efc870 -size 457963 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b47956b691d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b45d92fb9e725319e60f06e6c4ced9713d8f1f955cfb867f08b87c00465eb5b +size 610703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e94dace0fc8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1f85a0c60728a826bd5d0cc58f0aaf046e7aff635b3c38b205fcb3e375e31e2 +size 477145 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 28250f24621..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:34624008e9023db6d83888c7209fbb7313752ae3e39ad26c33b88b57225229b3 -size 610667 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5db9479afe8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:84762d89de631a07c2a41c45db4393c89ed01e2bd2ad9cb25f857461ae21fa07 -size 477899 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c02ba172fcb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb841ecd80a33d1c20e75bed5168d357d164afd80edafe34b3277ef08ead1d91 +size 607047 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..02fbc98d02a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8ab776880d42f7e1f08950e844e0919ad79e553c82911edbccde83d1d944515 +size 473343 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index c3169457da6..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:acaa0155353f5b04d7e5e329b18527b1fe35961b31955fc1f3965c46c6c50abe -size 603903 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 855ef7a62a9..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:20d2ef5c80a365ecf88aaa9b287b42d3e29afdb4be9f1ce3e529368fc73f9842 -size 473355 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..972d0cac9a9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86f9463b72c9c4aec4a8a32a23e17411e1a9fe7b94e923ad28062177a752b004 +size 642030 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..744573583ca --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e123b5c316a726057c256476bc3a138c9dae9e71cbfe1c480f94b6b6766e92f1 +size 497273 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index e3a44e79bc3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:533d0d814b32c4cc0d05c8aaf5eb8e413e1dd221e98462d15518c5148ecf59d9 -size 641748 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3c06955bb9d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:592494af114e214a1cab7168d6abae7c9a6b7d16408cec5ba688f84afc2b827d -size 497287 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5f5be74385e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cd64ea97d93276a40bbe50b0a52d416e8968d29de744c6980ce2d67ca3792d3 +size 613855 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 308582ac8f3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:061c93467482bf760530264844132b9f355d5624302322ba0c4946680de3ce73 -size 611551 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d8a36683a19 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a34b73c0b789153b16b7ae698fe45fe93cdd3b4da14226e529144142ffae238 +size 648888 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0398cb6bb2a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4e45ad66a81365cb180108238853245a75e03e5de187faf3c804a1cc8f576093 -size 649344 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5d2f11d57e6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db6cfc7b90e119c99c6de710ed96f2fe4b8c754d75e10cbc64b62611e114859c +size 573643 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2b2785c24d6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2887c5c37527734d2dda471a8fc707409546fee53da37cbea3b48d91bc0ddcb +size 446253 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index c1530897592..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:42fb1c37b79effa0dd232d0cf7641ef88d8bd55e4b34ebf164b471a444a0b54d -size 574397 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 71bc93969db..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8bfba5dc16103504d9cf8f69a04c4361408b4cf257f27d26f5dbd922eee0c79b -size 447005 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..483953fe770 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae4046de1849d5198fb24b40b88bde8596e9c4928e61d961349ab0071c7e138f +size 590617 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1cdbfd65233 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f443d316764083178f82bd8f098ab19d0ed8c88ab7efab366cc27a96ff8518d +size 466237 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 10456f75aad..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1d51aa778296d937bd180895b23830225e69453b09687a9a98be480a46fc875e -size 591371 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index a7b33515028..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:94dd9f4b5e4d3506cf2315c6be9c564d0978089ab8108baaeaf243e779f0817b -size 466989 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..c86a1762bba --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e87c349ca7a317ca04425804d09589ca39bc1b67c81dab90d379abfc22d661f +size 418419 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 3e968240a2c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:51c4bc95ac4d760b25978c5e0497d2d3df79b540dff4a30e28896f7485dce5e1 -size 418383 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..cfc20d04aec --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5126a081090b8bff88cccad0246de9b365ce707c286085470059371369bc71db +size 560421 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ff6c1be09e3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c11a02b2d1a372b97ab1e115f612d623342e0a077c2cf65bd8e0f431b271f229 +size 428097 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..45c9d0ea0f3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d30825da13af475de7c4adb1b7cae8bddeaad2931c90d9b7c7d6da813bcdf894 +size 656036 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..88397f1ce5c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8438c3ede14e5b61bf4d674d4d27baf0755ae39971a6f20d99b505b23961196 +size 657172 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..535d2930910 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba4785741a4320bb0ee42863b02950efc9ad329f0dd0c365399cca6f8914eae0 +size 658356 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 946cc032fb3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:77bef45dfdd275181aecde186b7c24593f001df654982b0bf8f5961ca71b5df2 -size 561175 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 85ecbef5a4b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0778845ba447848e8eb44f9543db1e3cbc6e86ad82141f04a400c0c8aabc583d -size 428851 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 43bfa4f7f3c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ef1ece9b60a2dd08e3a9b38e883df5f5ace71c5e834f1a598a160bb6e87095c2 -size 656790 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5b86d60e468..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:712abc472a9b489d4cbb6bfcb165060723388052d97a0a07e222aba7c91758e7 -size 657924 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4da7c49629f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:52072753e812e2a3a68f32abb9080f348877c17765edcce0bf69e2441fbb49e6 -size 659108 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..2766e72e243 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1baa59ac004c9765dbb3b915ad805431b85176d7ca50f8ed2db4a66172a265bb +size 441559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 24bee6053a6..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:65fdda458d21c964ca4c05f8bf90b0bd73149aaa898c852ad905940feaf8ff49 -size 442313 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..383ca9a550a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a67a0652e835f788a483a2bdf1f8a0909d49f7f66110d3d2c8d7a2d306b6f86f +size 575621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..39305e5c097 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:385b4875668f922d268d22a2a7402d3aac2ea17e848286f37471cd9a4ee02078 +size 451239 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index d9d8760183c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b512a7e4b9f7eb038d00a44e29e0a77da2b33aef0c38c05eb629487f17038a85 -size 576373 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 2b3794c1dd4..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6ab7fee3a722bdf870ac55a08ec54ec1461ad1ca3a023cff738c60266200f76e -size 451993 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..02b51367afd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c4826e716e12eb26aee266859c8cb4e21a21d7c4fd40ea28c3578f79a150da8 +size 545931 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..44d2d44cf36 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14208d8916b0c1d5ff22bc644315abab7ea8f0c9626f5343c873cdf911c67591 +size 550421 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..050728fce70 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d702a9b1c32f7ab2509e4da0211572f39cd39f45f13fe16bfc1e6af31d23457e +size 432503 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..c40f22db259 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2689b9b5e2bdc335a63b578a4392a758690774f51007d9957c2e38c178f88f +size 436893 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index 19158427c41..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fbf3f3db4becafb009519aed1c4f740bc42d9d37929b14ee05a6696e6ff98e2e -size 546685 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index a59825d8189..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c0fc554c75bde4b75231aad2d565271ea84b824ab5bfd6484053355fae59c173 -size 551175 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index 9444917441c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:de19256937a756fe7007aa526a1bd9ef8b296c131c35475fcabc1f684dabba27 -size 433255 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 6d6617e5293..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3bc81488c7ee1284f49a4b0aba3382a7ccc1efeb5d15dd3bf1d10bbc30d4998b -size 437647 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..fc26fb52b28 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9806419d50168d5a372e88e3ee3a9e80b57893c90d6a2fc7014798871c9c2fab +size 560143 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..b107e9b98a8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e161536721381b6631b53b45548aa2fe409c6cab25114c1344a19735d55e593 +size 565669 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..3ca068577e6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d0480602f1f043d53377e4bc0d772b6d782c9ca0215f6729335663f2f17aa32 +size 447307 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..1acbba931fd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a4bd122d51aa8afc1f34da2e81dd2c5dccac2a20e6da1b9be4c8b092fa662fd +size 451253 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index 76dc52a3d6a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:edf8af121cfc9f4476b98564b81d8b1f22963ca3246f070f4f8ecd054b3f8d1e -size 560897 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index e7071e7aaef..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2b5cfa916c2e6186d9bf4c24bc67d95d1fc03e9ad0bbc34719b4b0ea52379450 -size 566423 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index 20bfdba29ce..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:625081524078be6791e4be90f592c8942fc5aad833d25ee2bbb69dbe4a7bb841 -size 447271 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 086ff51d788..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9bb34bf536570a2dc5e0fbfbd386f432570723e7a7126ca2858dbc706d13c301 -size 452007 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..75fe6adad69 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0533b165b3dcd375b077ccad166cfe191e9c07e5de37a66a2d34ebe30643b3a0 +size 561719 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..f7070e0b0d3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2aba3dff5e085b85c767a08374306230e878663bb4401dedfdc5f5128d30262 +size 566159 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..4f6b770770e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f60fe2b78985c0723fdc42b9d6e4592df21260f91786e451630879aa481419b +size 440149 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..48ff34067c3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93344a885c72e62a47a53035e5bd9233931c8d0c57b6d16c3843f073e2c5fbd9 +size 443751 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index 8136363cb77..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d2c38c951255c24d5268a16e8e8fd3de0f3c8f8c1177e1bdfc717c10528ac780 -size 561683 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 6dd495964c1..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6f8d54d46a848333f81448f21456ed75b253ba5838c1b13d52bbae906d748156 -size 566123 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index 7f05056eb5e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eff08783d0d4963e287cb02ff762450cd85aa1452c5eac81c889537e595aea9d -size 440903 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 98be6f7195c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d971a421a701c23c89029cf3532dd251b1c18d3fe1790d9333d21f1dbf8791aa -size 444503 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..505c0ca8e9a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e949af7809e67611eb8dc676e02d5c59c7a45cc2751fc5d21cc5d52be69dc09d +size 575141 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..2efa9c52c6a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a84c1139d849c7fb0f85b01df09f274d910d21d9ee6fe49bedfcc435491f0181 +size 580667 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..691c3609ece --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:207936bac6e4330977544817dd122db76e0a6c863f62eedd6be6b206c4cbe579 +size 454163 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..31cb454ec50 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dfa3a8b3e32be0fd3860b899a87e7da8bda61faaa08096d094815ea06800af5 +size 458111 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index 16d793eac05..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1dbc6cca1ab7d2338b4e89d3c702757ec7808d1115c093fee8ac5d148c3de883 -size 575895 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index d7b86f9b8db..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d0595c5715107c1d64b54785ea667619a29b360d16bf93708d78a9ac85c228c0 -size 581419 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index d585b9ecab7..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:392704fec72ec707b11591aae55dd51dc494bd375f8f050695124f980ec1ddb5 -size 454127 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 3abe01c45ac..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:44b8fe800e480173748c51228d6a0f198374c8ebfd30e90644ab5ef7810c2ec9 -size 458863 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..50d7033327c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd12e6728e365bb6fcb0071577e536d6d6ab338e93b120fa039a7cbffff0196a +size 570451 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..6d89b190633 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e58b8dbcb022dbd6bf2bf6716a706983ec2232e27f5cb8bb8cfe3e4b7aa86f7f +size 574891 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..73768530b25 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4bde05d35b5f839023ae6ac1dec6164832dfde41dee98a3bcdb1d4b95ed3f54 +size 448043 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..4bf3397a2b7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9c9f7454e677cd7475f9a11a04d72b61013c45be3b5cc28fe5406df8cd15c00 +size 452433 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index bc2bca9c7fd..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:434f2c9bff5d09584988fdc7767040ebfaf66b7d3adde93deb40128eec445c22 -size 570415 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 3bcfb977daf..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:455a0d697086956d618b47bcbb4214de137b0370287ba59f3d8b8be51b7c6aea -size 574855 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index 1f1558be1a5..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:92aec9b1af3a7df113d4057de4e5be676ef2ccd38db45c31b4a1b05d471a2a97 -size 448795 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 0e8f067afbe..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8708594caa6e5c11c990c6bd699ed96eb5bc6d660f6154613490844ab9f9e3a3 -size 453187 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..751e98eaf77 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bce1b585b90a35d45292cf55b41cc1f796677a4bc7f6652f3449298c7d4c6365 +size 583873 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..516381bc46b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba92d23642f5776c85397e06f3f7fba156b2a2d8f6a3c24df678d13fc55a387d +size 588609 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..2cb5488c032 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8051a6495f08037e8342334643b8486594d93891f40cacbdda24f39d2ae8c2e +size 462895 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..514709977dd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c447a5abb56b99fc57cdfd99f73480adce6555d18a4e02ab37102d170e5be1c2 +size 466793 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index 7ec692847b4..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:62d59babf68d35ae157095f36185a75df80643322a73f1b24226a061a5f5f6a7 -size 584627 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index fbfcbc9ba7b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4f137fec086b93d5f00d7d526bacdf86e6c2eae4ad2c4067ef12213aee8b0c3e -size 589363 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index 266f1f84f39..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1b39631e5dc772df53170af3b2c6aa86a8ad77f0c84bf30303d64c2312cf35f5 -size 462859 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 295ae4d6e57..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:75bb517365952e4bd92e485e45602d0f47cb4768d69c47fb07006f7dbd9812bb -size 467547 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..0ffb0071b44 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2032b826174b423372454468569656bb5b04f3df9dda3f62bc1d03f4346e4345 +size 541979 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..757198837b2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a8846b5b980a6bea8f80bc447afe5b52bdf0e83691a7e1be268538b6531f598 +size 546419 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..794debc5643 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff8688391ce02c6661473b3af8612a94dc06dcf0d8145d25528887de885c42d0 +size 428501 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..f9bc98d916e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6776f4136629119f937dba02c948be81d7ce06236a5e8bbe237312e2a5d2da67 +size 432891 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index cc4629fd381..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f810812c255bf32491639d1db9545ffa22a954e64cb27564389444c922fb5250 -size 542733 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 323167d5cfa..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:afeb850496642212d0a128c8d97823e004627dcb5559c62641ee3d243a73fcea -size 547173 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index e4ac36abca0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d1bc66ef28cbfdfe41aa625a615a0700004b5fb1b8884c5bb14563f7377c63fa -size 429253 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index cde55a40abc..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:56796506e7651d1095356b5114ad21e9bdadb58e238f563cf8cdba8ca3ca99ca -size 433645 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..848fb0e018d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:104097a5c4e9ba892135a0cd5dd5c3233ed6640083ab53c549ebf3966d8ce9c5 +size 556191 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..957f6250772 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08aa2aca8a0c4cf07ae157649888e3cd84c294613153c2545e76e74fc592cb9f +size 561717 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..42489bad637 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3f1f673237c5dd55736cb624eba8fc5643711c497baaec5297c213c84ffab06 +size 442565 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..bff68524168 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:244f2554fde577fc09f9e8dd214bb42739ca4bd558a63382da91083203bc8d7c +size 447251 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index 1aa9d4c7f08..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8ae2b32383b2631977bb28a0397d3827f9ef067a04557f28aa64ab9f1523977b -size 556155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 76c8c774620..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6efcbc01fba3243348e8a4759e68a4044d327916d88394440de3c4e5ae3cfc55 -size 561681 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index e4b0c75cad2..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b994bc5aa6f02d20a47db3e0fb246cd0d3aa6b3d547684eaf32167d30c91980f -size 443317 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 243491bf7a6..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fcb8441b92eae6ff2e2871afa77f1041668c6f090f53626e4902d4df86cef269 -size 448005 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..f049971c8f6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24edd832608c1f0c08a35668093d38d5304fa257647fb2bc85a4a478e8e6a661 +size 625402 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..22e54ec1037 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:381596ecdc74c5c434f554f82d51440b51f3b22b68275173f544539212787465 +size 629842 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..14137471ef7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b6052936ad6f25c8871a7c5c7a0b0b2d46de267135c430b48b99651f3c4c8c9 +size 506349 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..f94e6fedd32 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66265471695bdc5c532aa59176ca8accc26b16e5ef9e7f296ee9aa49bcea63b5 +size 510739 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index d2301a839e2..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6c8ab4698f921ff7fdc93133e1f23f1f8364ff4133645bddce5aedc4ce703ede -size 626156 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 85dbee0101a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f6c073ee3dce02df219794100c4efefc0d34c197a41a4e592ea187d15214b7e8 -size 630596 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index 52b38483f21..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e1b6cc69a38e18748e826c04c34f993c95d26acf4a5fc48f2ddd6f6e6ca024f1 -size 507101 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index b2ef9f684b3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:711a6742507e4650f3a8edfc5bae1fc7177f343e0da79c2b7ea971b82dd587ae -size 511493 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..a6774ef56f5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9423f8748d320d20e31107cec05c8ba040d6f3e0417a4dc241344c49721befb +size 644548 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..50e51358a08 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d6404619edd0b5744749e2a9bfd12d39436675c5fee4f6ad80e6f635b5037f3 +size 650074 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..edb4eba4950 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5956a9a0c52029acf9691bc3f80d51791ad6a280d44408d62f173b77c41729ad +size 525937 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..762b3123e41 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f99b2c3c4272d6eb463bbf9b35c4f12b973e2e33bb131f0545a29de78a150806 +size 529885 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index dcdb2c29cdb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8014b58215a9231a508e4f1759a3c42707a8de2e69aeb6947ebf15b79a3b467d -size 644512 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 6139b317e6d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:24225dcb9c8679eee8f5fd1ab8a48315b0e1d2bcfa9713a7b75e2f3a8a4cefef -size 650038 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index f93c8544847..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9878ea41d673bdbe3f0f0143f0dc7bad8fa0408f8c5f23bb62c8ce1118fdead1 -size 525901 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index f319e842023..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:05d616e8dd95ff608b3bc4e712bae99b02c3c89416a0f0b1301d6a12c1dc98a4 -size 530637 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c2da96ed734 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b22b64a96ba1404954adcc4482641c6e1eace8a49d85de1a116840296d94ac7 +size 670292 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e564639448c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:094dcf85043c91b0dbf0198243f9cdf75c904bf4d3a85537654f82c33bde2bd4 +size 538461 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index b0e0f6542f3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:06ce55e1ffe8d7a4f81fab7b246297271244a19d0e2c1b092d05f54ff33eb31f -size 670158 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index e65f7d24295..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3ac195f1790cb5d78e192a715c9655f90c5a6dca6a456195f5d6f50fad8e440f -size 538325 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..acef1f647a3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b821d5f9f6abdcc3c1493f5ecc3df02ba2f834b63d739424a57d64be3bd128a +size 484919 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 672a345e6c4..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:306013dd81e74a9ba24a4bf4285a9906eef6ac24088b4b151888b1331d41ac11 -size 485771 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1916cadf160 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20d7bf9e45059e2a40ef01e7aaa9715a78fb3cb4eedcffeb66798425e7481e47 +size 693138 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ad1a0b3cdc3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4b2478f90513648bed595aa9cb6dd8c617b7bed2374fac5dad8376c8f29f96d +size 556323 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 32e3463d787..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4930edd6e2c0b67e22dbfcbe27d3610e951085edd3a0a0bd4bee21c0ab0901fd -size 693792 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index dc3c83ea3f0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:121e71dcdf3179b34971c88fea459903a537756978362deb8e8569a30350b9a7 -size 555399 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..fec4e7e4d71 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:183f554afcaa22c7a5d989f935a2e8272675f490ee7806e8533f563ab79fa4ff +size 496467 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4fd6522ea07..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0cd6c1e7705c205c7f143c0f69c263f6081eee9a3ac9f38aafa6c4113b89f4db -size 496481 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d652c6b80cf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc9e9f5be4b80505e9da1e9709fbab40f31061eed7532137392fbb6ae6cd591 +size 547741 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..644bc895d12 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23b44120b7a5dca80a51b071cc4bdb4d95f9cb7933d0502fdcc0ec3c4bdd700f +size 420893 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 2abfc94361f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b6783a13197b6c40eb931714fe0a90e42980658740f399d5745820b569e3f2ee -size 548495 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 78aca623a61..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bca579bfb1ddf3ed3d127a36bc284020aaa5b872edd3a34569b5d6dfc7de7553 -size 421647 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..55757488e21 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb385459078b9ec461849292a056f55b3edb91a265acfd0c66ad6cd386ea3f97 +size 571475 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..4cc7d21239d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c12243ede1fbb72fb7b6220c1b83c23b1324df849050114301e4fa6d23ef0fb +size 438755 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 97fb6f78e9b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2e1d99f181da39a8fd23317fe12b37a4e2b3d5108307a4cee2305d35036eb68e -size 572227 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 03d62276efe..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ef7b37e3fc75809ea83c14c4c2093dcd0e407dba32c1802944da4cb529d18add -size 439509 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f6faf5183d6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fc903e22c31ab42f40c8d0e1daf90283830742d8c9650ae76206e2c95eb4d3d +size 412987 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..eca4b585c51 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a16d86de14bf5479022902d993841f41961c8b586f554c1c5ea06d7be5a03bc +size 325705 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 89b5e69ff5a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3ac54bf2852e9021e8ca6491c05e1f458843ca7fd300ddf878ec4569a3436506 -size 413741 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 27fa4f535d7..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d2264843de412c18fda7066593a70759af5b28c522bace012c56b8acf6b851ca -size 326459 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..77ff514606f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50bf1a9d178db2ecc37959f02e7ed912ea5146eb8d537acf8a3b962af39b222c +size 427595 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..facb752b6ef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0031479beba178e60bd7332ac017e9548852853d7231f9623a004daf93652500 +size 339473 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 60b47774104..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fa6608351afa56eea924104de65fd93e51dd36b6c865b1e68c97266e72247a5d -size 427559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index d24bb99023c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:60357924aa009f3945ef537e2256eb954fdd073ece44173e6d34f8bbc0a091ff -size 340227 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ed3c98c2abf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcada9c0203c23939531fcec86f4b9002bc5655fa44928ffbbcf5e3f118befc7 +size 513483 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index c8d5c008093..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:29ba655439149f2572754c900d5d2f7003460dd0b8d46a31650b682e4481f2de -size 513547 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e7e8b7219fa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:184571b0ecf23e93c4fbe441c1ccb69281b072354501bfb64be8fbc2bfbb13be +size 524883 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3779ecfcc94..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ced5de0a8a7a0f914c5cb36089d820d4b71f581d1eade3cf45e5b44eb14a8e18 -size 524847 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..13605e3dbfc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd61e4fc47ba3a93cb31e7fc8594fc96432660dc314033ad144ba5edf6443fb6 +size 484829 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index e11c7941df0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4a8cd3b1898e06315e0ad579e9397e686ae8b501214a26ebee05321245afca59 -size 485583 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..3c0732362d4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7923186a911cabfb881fb348a4e42f731db4a8f8fe5cb7967a0531b3d8b265d +size 497067 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index fe84f293b57..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fae903a15a99aa66c11140c8c9471fd4377e3edde3302e4a9dc9e3f4ec802b1a -size 497031 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c3bc7f4f5ca --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fa30d15989c3c039c751aad16c9451f81eb6e7ec96a0b40cc7251401b0fcccc +size 564021 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ca4f56f6e20 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5b7f409c2ab41ec061e3c184c31afd554274921715985eaea515f79409e25e3 +size 438407 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4860cadc476..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a378ddaa3b3de8769498db71494c8f75c0d946692fa39838c721fad00894bd6f -size 564775 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 2374bd7d53a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7c003cb50ada5f5b6e5c336d5640539218a2a59a0cc8439eff9b78ec10d9c7d1 -size 439159 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..56fda87e615 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2470c7066d54b0d8d51b4268b58270241e3dee5d9b71c0f36cad76cb65fc5006 +size 587755 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d6bbf444bae --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f24387b2703f8bbe792a3707dc2f3f4af0fe30472600a74a1657970602d19f5 +size 455479 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7dd9fcb563f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a9160577578207878e3dfb0e0e1468258824ace9337878b6f97ccb5284c12102 -size 588507 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 84e14e507d3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0b85f47cbd555e3420322d19cedbbe49a52886d070de9d7f1c303205ced14904 -size 456233 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..a50baeba714 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f436028f7735e126e206907407b1e3589b4f22916465b9fc43156a5c2a354b7 +size 423051 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e3053901444 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b9c3baa06a2a23cb91d0e27ebfbdcca8d41923faf445c346e4e9e0015104d83 +size 328961 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 81c37edf57c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c89b16b081c425bc1cfe97098f23eb0c3862a9a522558a84eda7d0d0c2e14639 -size 423805 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..deb410f8e25 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf61ebbdba444584c4743a4cb4a44bd738c35c490ed874bde8d4bb6f3300dfb8 +size 436919 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..8f51a8dd0e1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68bb1ed9a09cee0b5175124d572dc24370cbfc52606b441501006ff2c27e259d +size 343567 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 06774ddd57b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:483ab5a8d320ddfc600f079c2d39b38947b423b4c86d038aa65c3ac7843fb448 -size 437671 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 06c9fd08398..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a5c1f09620411a515fff45889e63ae577c5e1d949d54a2ac17445039fa8d773a -size 343531 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..227239ba34c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98d5046cee8334a562761ea12156bea4be425a67262e7934c63c644651ddbb06 +size 603783 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1d37cb27934 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db62cdf42fcaf916c8c1e72c8f3159763b896d999aba58770ce4ef398f73cf51 +size 472939 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 6267e7bd5ef..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:805730875ed927d20b321eb3f8706500c370b0998c00ec32dd820a26bedd035a -size 604537 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index e0670f11c31..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:36985fb0924a3ceefc886e232e1f5c275c7e129773195a789124e64f970bbaf7 -size 473693 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..de75a6ded35 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54fdb80299c765259fc616dbe46c82a7733607e00c2edcf246eb60658524da4f +size 456253 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index e0441bef6bd..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d576a825a9a19e0b7f8844f7a774468360d501a23e390e2b5fe782c44c4a4996 -size 457795 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f5d895c5371 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cae78cbd57159b5298ad284c170bb130d7d8347390262e5ea7ada2aead542e7e +size 628308 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..bad12c1a3ef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97a3370e4e68ee43f14af56ceb154fd03d9c8c8fca8d183e83b66a2bd03a6a02 +size 490013 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5089e95b00d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:386f8638a1bc64a557b097af19fddf056275ffc7643c5cf169de4c034505ff2c -size 629060 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index be5e6f05f57..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4e84136047f6d0ee70c766bb3bdfe7c4223d51cc220b9f57d84195e43dc1333c -size 490767 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..89452fd1d0e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5d421133702f270074abad0f35b010daa51037abbbe117987e1da656d7f4989 +size 468591 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index b15fc82e575..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:43fa257faddf2b183d603b1ca8ef767ab4349380912234cd2ec387732c6ed77a -size 467765 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..f571faab398 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd389bc4b88b1e94cef259cd53b614e00063e56fc4489e401f2badc729ef6ead +size 401095 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 5faa70fd286..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c3223e4c30c769c9c366a76a295f81090988827f692550220ed2cc927ef2897 -size 401059 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..dba8cf29e08 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e536c8c0ea606ecccc270a346da9c925390af0c6b5c5e6243efb27c905ee5d21 +size 543393 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2fc5ac70ef8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f24256fe63c95179802d5753314eb546f092658745d75d2ab17ca379b959c6c6 +size 416989 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index f18e1b318a0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a621dcb89b2a8eda3acebdcd829ab254c5dfbdc5e8e752aaf11772b2381b8151 -size 544147 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index b47072332f3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eb33e53b308eb07b36c3770ee1dd6eea2a2be0b50e32699a0e82a5d4c3c1a526 -size 417743 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..0d599f11aff --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d82128522329d497f87a548f52853e0480d8be81720d54ffe9c137b400a693a7 +size 418315 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 15aa2e0fba9..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:68b9d59191e4599c9454b84f2a462e249b1c335fd2cef863f33961d46caf0fb8 -size 419069 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..6e0943b301f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:010dadf212a675369cc4ab37f1eb43ad8163397fb6334b8580e16e2291404c9d +size 567521 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..3e989da3292 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6e04820f9bcf6fb1bcd1f00c1f95020dc94a3f00370118b41cc124c3b6e42f5 +size 434901 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index aab4bc36681..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:894a87318aec9dd78e6729a636f8443564bd1bac071489429f735e92dded8abb -size 567485 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index b286ff4e301..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:46809be335670f08b944aa7437c9ccfa8248b3427a596ecbe9b0432e37b5ecda -size 434865 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c56047f0e7c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da037a166ae795a52ac88747a02b4ab0ae4a8b63bbdee3d608ffeaa90280d4c3 +size 409577 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..3041d52ae62 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c87d44236856286e0bc3e13ecee9d281d262532dc5c238ef869034f25e52ab79 +size 321703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index dfe1d857ec5..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4d80509a4bb1d10bbf7626b74613d5d5c5890a7f54a83a757b603a9819ef906f -size 410331 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index c468287b06b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:05ad048e0ef0cdad0bdca45ce435353e30265519ac61e2904c6a51f12c93c1a6 -size 322457 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..926d4a33abe --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9afa085cc6434916b9d8f4cbcd34611c0e696f63b62c374da53830586cac8c79 +size 424333 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..91fa06ee389 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd9cc0e4a1c20022fce87b47b82ed7d57c1da87d6521ed7fc08de0b976ce52b2 +size 336309 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0389d95943b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:839e9b381fe946458198d0cccc14baa4016360d244ffc0d09e6be705efa267bf -size 424297 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4b816724bf9..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ac2155e27ae2502c98d5753f0c5f5f1b6857cf9763b6f1d46ab589afdbe28f91 -size 337063 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..dd706016169 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c47cac7ea8d5457378a372cc6a6221a1c751af1295da9b6967540db687f782b +size 322581 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..cada4db386e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f1eb3f4e5426e09c8b4f6c57e7227e25a88045479fce91024e53e3c76228b4c +size 326737 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 
ebb27a8404f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3851df60574d506931c37bd8ff606f0c7e2ed255820dde54c0dcc4f141ce105c -size 322545 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 5f5f31ead90..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:123259fe45e57e0dec04b267a2ec730d6cb0d89de9c144755a5a203238cf06ac -size 327491 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..1c342a47bbf --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69834d30ed3e1a5fc0ddd9d7fe62d9d5eb2e195df47588485799c5da7c43cba0 +size 344391 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..e6d7e6d52a5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40baa3b185bee27cd06bc24fdeb831145f9df131584f4641658c5034e1dd4b41 +size 348595 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 6389365a0c8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 
+0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:94741b8c32133040b8cb9a2a073b57c0f4a11e624945a6699c94492de4641a96 -size 345143 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 925a841d645..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:92e29bccae576590648c0e1df99f5af18ee23e0c8d8e016888553f709b523753 -size 349349 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7a0503663d9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:59873273d5eb5ed86f6da9ae745ba74af573e6621ad0fb02ee0df4068b37eef2 +size 604507 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..049d021aaa4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16e70ed9571f1217bfb675f60352fafe26841e1df4adf4893acdfae7dfca1e1a +size 492113 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 55c1cf56760..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8f084e44f8a0556ef7fb5f150c5f72df1fa0db17d96ca97abd41b520f70f73a1 -size 605261 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 29ede6ac07f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:95081dbe757b0863aa19e350dc914875fa0fb41a4869ac8e69ae4da92f7ae3b7 -size 492077 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0324e97d2ef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9541c7f1d5b0aac46f49bfa6cc3d41217c7827c81488d464b4298412d44aaa01 +size 618918 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..920eda17174 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a489d86a4fa05ca775113462911a48eb2e59f3ae7eeabbd538b1cf3845b8812 +size 506523 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index a4b44863268..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3b21c637236e714ffe6825e3c11ccbe1e01d0180268f0f79d4a9e2f19c6b4c3d -size 618882 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index a8dca57a799..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 
+0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e6077ff6424273cf25ce5f87c273a0b373325f4f0fc4d1971a67929a3bc7ffd2 -size 507277 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d1f049e8000 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fdad7f583014c9cd953d7c443828bf3ad417233e156b261e4f27ff53d2d8beb +size 690644 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5c55d823d2e --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bc5053fce7acd506260484505ac2e4bad43610e69d83591847993d8eb3007e4 +size 663810 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0ebbec50745 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07e809dfd556183ae2233823841c7e3952d6556b7c8f482cc00d38d14efad7bf +size 576029 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 
00000000000..94a99ad8076 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1358eb84eb5db620efe7e6f6915df505d375212f9fe2c87b7584fb3f007c86b3 +size 550133 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7b49fd82b03..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f66f2c17ac99db4a2676e04e3e3e04a8b445ca5129a2276fdd97d1ff9f9d53d6 -size 690608 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index dc92363cbba..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:869abdcf598be0990ca8f2e32edafd42f316a6bf603f3e564ac42f836cbc685b -size 664564 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 44cbd440247..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fa2d221a7139390131b1945a327333ac8168a3d4953c2acb4f388158ffcd4279 -size 575993 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4a9e5248fca..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0d7ee834af3f1c094e4361208523d1821f96a684aa5aba615131c7a4522801d5 -size 550887 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f40074ae726 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:048294570d794dbc2fe8410f3e594d5d74e159cdbc0dfe453f8c14edfa71163b +size 710874 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..609a65ce6aa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bd9cf7301a5e38815dce431a51764de1e448cd500c07b3f4e2f422404d7ceb7 +size 684188 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2f7f24760a0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aee2996f8e970dd49b94b532388676d6ce52928be3ba05d5e724b9d610f3b8c4 +size 595965 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1b4a248c1de --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3becf5119591e3a99ec0d3d4172c96b138f0bd162e35ed53d28c3593236ee23 +size 568933 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5a27fd69a27..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:01b629ef307a8bbe27c7d52ed158cc3bdfe5aff0a073c401eda24c9116989442 -size 711628 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7e36c7c51b2..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2926bf1545cd9cfa1d9dea1a3c10e39f8267c5105e72de0f2b9f146f3fe43e1c -size 684152 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 54a9ab618ba..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9cefb379b911cd878f67606ee0b12bd5e74e1d8600188c0d4614e71e3b2cb298 -size 596717 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index d2d04781161..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5fe67d6460fc71c13b0dcb04396105392f169dd88d5d56a63d8b4829b4a1fa92 -size 569687 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..99466b4b16c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab7b750d3c2f8b60ca9f718781a6b33465ac2d9f099c70d70853d53a2b41f163 +size 631296 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..47c89830046 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:4185234b71c8ae08c9b7c47d268402634566e9db4dd473bede30b9929f180b9f +size 512093 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index bd0e8be7cf7..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cf77885d43e8b96a7ce460bbac93fdce8adb83249e58455899b6fb99cb60ff69 -size 632050 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 91a41e5375c..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:07ee746d15d1a061c417abaf3d69540b6b6564337c38703b2a20b6dc1eceafd9 -size 512057 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..fdfd764928a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a4b47db4d98c594df52c416ed8fcb7fb5676de882dd7e136bf4ca532f87a03d +size 645706 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..363c842ef28 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39ebb14b6ded26421f4c05b448248438892e16b3a6f09ad801a69d0313377cee +size 527243 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index b111cce49dd..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:862631f3d85820484bc030aad997c3ea7a66f5464721d4c369e28ab6762776e8 -size 645670 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index a0fc0d9c8aa..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c33b56483a04e92fbd0d1a62e555a31549a873d247c071f8f1311ac9fdfcc77 -size 527997 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9d8568b9afe --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:2006854267fff583721caf7a334c66eda8f712dccabca04abdf20a6567b47176 +size 715952 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0951cb951d0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c5a74cf4eb94f1f627621c8709fa4d9bb22a73f2ee472e1b7179f86bba73b92 +size 667904 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9cf93d11d45 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d6131d352b37a2719cbbab63e56b985445a45a8ee64b9fe0b0bf69a92d97bcf +size 597539 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..59164d063a4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f26b52fe5b0ea75cafa85f6d8bb868a2c3dd52ba23315668c154ae479954917a +size 554425 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 
100644 index abbf4795888..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1bb7bc4b314785e9aa19284c4e3ee62b9c3ff81e72850255ac70f13757d0f078 -size 715916 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8d48f093a0c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:90457256815cbf0c03f248054dd9f56a0370c440c45c88ea3694c5664201502c -size 668658 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 465a5addc28..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f69b4fe6900daad0c0cf146110e0ec800bdf959468b6922bca1c47789d4774ef -size 597503 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 28285b0d8c3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:513a251256dba6ace15452b209c80a2c3dadb58c63e7b7707f86cf6eeaa694a2 -size 555179 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..75c949f6193 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74a2a9d04b64382ceb4fb471aedd3d1deeb57925bc75ec64cb73c74a426f6cf9 +size 737120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..60fdee7cd67 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:bfaeae41382de40451e48b96ae8ca97c52c38c7de9fa316ec34f339ce9d0dc9e +size 688234 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..a6edee7d3ae --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c34c182eaaebef7903ba2a0d7535de8d5dfdf04a19565a679c3935ef1c94341 +size 617523 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..768c9692131 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:18960f7b7b2da00507a72e9c857d171f317322c1c4e8555a7c1e0b9159aa1370 +size 573225 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index b272107600a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:142f6f33580cdd5ce8c0179e4ae0111ef9c83e7dcb869a88653b775292ae393d -size 737874 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index c0ff4100c1a..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8d8a49e75e0eacae364da7c947926ee47d758de78fa7b35aacf22d6e2563c632 -size 688198 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index f3a53a1fb3e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:396baddba05b0082ab205b2feaf7d22ab90fc5e4711cb6498bdbba3dea742c05 -size 618278 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index e04d2683aa9..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1c3651ddd22616226f7794860a7b8daea4dadadc5253eb25bc6cc85cb33bf1f5 -size 573979 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..00db79514cb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:992c1ee34ae372c1d27ce926b9848c5ac69cfc100ec1ad81319a87e564f3f8be +size 679988 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..476694257f7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41be5191b163aaf7f0aa8cbd80d354d55232321dd5f99b5e1411af464d7b651a +size 551807 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index dedf72b9b2a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:c478fe5922e31a6d484a0bb23dc5b758f912e1bd1dea6a2f598a504c2c8172c6 -size 680742 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index f6601f4863d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:18835b7ccc3f0490506acdc99992e0875e4e2eba78deb0592066126d6ec456e5 -size 551771 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c8e583fb55b --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e27079f3c78ce1427cbfd5f1b9fd664148f1d5fc2c9dd2e4863fa37a1a9ca8f6 +size 694546 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..6bdd6aff717 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da93cc615bda2667dd5c344a005a16fd013f321763fc9e07ba0b1e329dcfc03c +size 567005 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 73708c60662..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a31a24064b149db05f4d26b3feda2da54258a267a121520026c047f408c1a5fe -size 694510 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index b84605b70b4..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dac9cef53d6ddc1b9edf2ac0cb6ba0c368c9e79447f66ec5865129326558f596 -size 567759 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..768781f67a0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebc676587758d663204b4f03b880fb1f35834cf97d942061cd3754989521d513 +size 772044 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e5b18a53346 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:39d6fd44f54d25c009885c1d9089819c22eb546c0325794bea84e67cb44d470c +size 676340 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2463b361013 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16a9cdbbce87b4374fb7e5c381be6afca3636f12a6098993ea536eb0d171b85e +size 639670 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..70a5fb10b6d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:d47b7c93766992203a192790dcfa938e9d84e4e2286955ed60421879add91a79 +size 562763 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index a18bd0a28c4..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e30ff779f82d52a5e90c9f7b3904f9a3f41a5f8c11fa19c8278913c658ec825b -size 772798 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7252d97d7cf..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9a4ddc7d03c85c7878d79d97a80cc0d6686e5189ce85ac6452623931253ac0e7 -size 677094 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 281fb80f8e1..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:37a88e1064adedc9c2b3978b2aedf581a9c92c6eaec38d395f86e0d4fad933ff -size 639634 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
deleted file mode 100644 index eb157d00e7e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:243b092a6acff3d008291548c9edbef693139fbede884d657e9724e55cb6f082 -size 563515 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..fd9339131a0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54c526c2880b737af859ad88a894d78f681cb3761ef55e4250e216c48368f21a +size 794790 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7f03b7d158e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4ec7a573482995bcc740ef2be8dd996fd94594d37155627ca103319c96b957f +size 696720 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5b915d3334d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bcca6bd5f775e4b52d4e6c6eba94e0af495475dfae3955629518f34f2a4d52c +size 659606 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f25e37b67a0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfeca47d5e2a7c67c35cae78f589fb69e1ea3dffe935f5779ff1b0cbf964b869 +size 581563 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 675f27dda19..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:d2144904799aad2ee207efd5c0ad11188877689d7ec5c526ba7c450a2f410df7 -size 795544 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 108fb56f041..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6c33e935421f6a917b1fca804fcc26e536122b81cb10ef49de7992031b6d043a -size 696684 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 838e5dff8b0..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c4df831a272980963e10674702e2f6b9ec6ee51ca946d901c6c58560e91d9ec5 -size 660358 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0714cca8c61..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:82ab5b65c822799ef35d6fe69e80be180a6af880f7eee18d40eff435c0eb7844 -size 582315 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d5b52bc55dc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5720a864a077a510c0a762b9d098cacc4917b240bc4b4c511dedb5f9d2387bd8 +size 591527 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..46e515a7636 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:793c107df8bf3e78d6d9bccea9d5d971f94f0f07ecd01b99580eb35200676ff4 +size 481599 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5c7cf2c3283..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f8f24f7567712a2a641f846a370a406590a8ec28afe41d11b7e3fb46fd4293d5 -size 592279 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index c29d11eb3e7..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:b63f906045b1c23b6f6097592d06106e72308899a84e1d9c77611188496384ce -size 481563 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..758747482b5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a711b2a272d3233915c6fc7859d9244c506a7294c88e98264e4f901e3ac2db1c +size 606675 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9668f8a2769 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90c4119d5da494c33665f17d0c0d5037f082d8966bf1844a82c61d6a77932b0b +size 496107 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 2fb33692714..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:edc1f6569cbdc3774c98662e00a57e91296c19f7005661a99596a2618e9a6a54 -size 606639 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index c46f273d974..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:85ce2e12914c0c4262de433725bea1b9dad7fd72eb89760e9b9835280044965b -size 496861 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7e7723f224d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:896d3995d814b21458f845a8e897cd03c6b2db046d81ba2a8a63cc3567668766 +size 678206 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2b4b80d6581 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad56204c2d0b3a5d370e4f067b62626d04a783c2ad9ed0795d7c475b99a66e57 +size 660646 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ac24c3c42bc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:948f700803f4ffc735f978635c2cf1c82baee62efb729bee2250e7c34670a8f0 +size 565121 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ef280860660 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4ed5ce09bc60d601a90dfe31e41459545baede025c5a3286db4d19db9770b8a +size 547857 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 9c3ff795e68..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ 
-version https://git-lfs.github.com/spec/v1 -oid sha256:c0bc5e1383545a44af16b026334d359d3bc0eaedd856ba61ac8bbb6d12fe978b -size 678170 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0fc71737708..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1804bdb2f2e3d9fe2aecbf5dab5091db45d3b806bbe23821facf27835c0c2643 -size 661400 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4177c9bfe44..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1c51d507fedaaa76347c1bc861b3ac63ddc9ae7aea49312460abee52a0485be5 -size 565085 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 2a0d2169656..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6e004564dcd7b10dc0b61235c9a5696315682c6c48c8d7e8af13481fe122662f -size 548611 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 
100644 index 00000000000..bfb6cc4c44a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0412338d49145224b7663d9eee8a6774224777e03fbdadbf7ea78d02d5772de +size 700262 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..fc78bd99c8b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b8b93acbfbedac1fce078afaf10c5fff787612b7704b6714ee8f186d22915c7 +size 680236 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5afcf2c44d7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cddd2e8a0297b8fdb376583ecc4dfe24d8c31c758b79b1bda019bd332996aff7 +size 585155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..04785b1da22 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe7cdb800e47fecb053fe12ddb46e023782ed52866f4721d8069864185400065 +size 565869 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index b90d12d7d1c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1a73c2fafe069d9c3fdd55f90c58e1fa20bae7f2b790a574f0484decfc1f128b -size 701016 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index a821888a2b0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:d6c2f97a55f3d0fd1a5fff75e4eecde1755e8314d0ebaa8e5ad268e72f10908c -size 680990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 2bf191014c4..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:19f99ba1c603e7fdf548686c1c93286e9d3e5c0e56a6b7d37dfda9ae0fbba768 -size 585907 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index b7f8a50181d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ 
-version https://git-lfs.github.com/spec/v1 -oid sha256:9e97c408f02f76f838562663990c4e1b012a78250953539b47a9f5582c0c3a29 -size 566621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..de603df5d8d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c379fe3d229f0bdfc263c85f0d4dd016db73155a71789f46b8f93970f758c722 +size 500069 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..bf34d9f67e0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:da4a624158434a36d7a36b2e0e48195918380ec6d8420fede5541cdc5b41429e +size 387331 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index f10502ce05c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1d5104820ee5ff70cff122828301cc47d5185d062c8bd1a8e342936428e04c70 -size 500033 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 68735e50b93..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:30943c94bb40051c6f9bbf2d2bb885b21a278833ba1f8a382f2b446f74618f7d -size 387295 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e96a9fe1906 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a9228c39e8d20c051cf130d15b939c122306601ccfc1e5e8ad79a407813333f +size 519067 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c97e854c869 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:da3dcf9ca6ddd6ebcf423d8d45b19d3f1cb8c09e52c82655900af1a229ed3a6a +size 406969 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7d5b997441a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cc30e550b7696dfbc6a3df5b65df1948421fcfd0ef9eb83fd0ed19a7d58b1885 -size 519821 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index fc241a4ba4b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ 
-version https://git-lfs.github.com/spec/v1 -oid sha256:816b71ce6ef7141b48f91d26d55b8e12ca46cf299a70630bcf4bf2b416d1ed8f -size 407723 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0f81e75a413 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69dd9c018f5825f0e7005988801120fcefca61185291dc41c5fb48251de1f27c +size 507717 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..557e7236899 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:6cb8bd533ca74810f7fdd1d60f4e8d78ce4f1a0ae79d47c94bcae2053eba7cfe +size 394189 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 96ebdf86ae9..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6b14737fb4483f1ec7570faefcb8ac30fa89c2edb037db087e7c13af56b10ebb -size 507681 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index aa96dacfdf4..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:831f5683d82499fb79da8265cd50bd92da250fd5f61816aeafb1c9e28b72de6b -size 394153 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..445f75e6d87 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7f68b329ccf0aca6412bab2237f1aacd31c78ad05fa70f44cc32025d19b4385 +size 533867 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..4e49d3104ed --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:a37191bcdb2f0e5badb18921f6fa96ad9fbcf712cbfefd5321d7d0a9e94f9728 +size 413827 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index a4470365870..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:670c41515b594dab1aaaaac435ce77e1ceedd90b6d96cfd020306f66d5a58ca2 -size 534621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8141f966226..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ 
-version https://git-lfs.github.com/spec/v1 -oid sha256:3c640a37e0a23fd275460d60654d490073fc3b5a6fc36d740da7c156b4d75c01 -size 414581 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..bbfdcef3edf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6029dd8911cc78ebcbfa572ca470d2cb6f1a914a141850226b092500236622d7 +size 515857 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7fca86f96a1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:51e2522ce8d1ed31f08bb18c4622a7b97f6d2e75c289e365b8c77985b853a7f6 +size 402871 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3cfaa27ff80..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:79d104e3b6d6d530a2dcc85166771e824cddf7e14aa78c39af843dd9ca8e761e -size 515821 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 9056cde4a6d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:3ba4c089006883fdd297e40e990c2bb51d875ccaf257c6d53c8a96bdc8861b5e -size 402835 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..73b9a657428 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee8aa6182bd9b10fc8cbfd93af9bf502daaa9ecb09c32d8d1a329977302d2720 +size 542599 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..68de86f255a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:bd5fa09bf1bd52105c73c2e04f78e1f3839f424099a3ca4b93c789ccf15a5776 +size 422559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5757afd6fd5..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a95b4b053755143cb8ca2e4a737d46a0165b4812349124ae2aa75adc61b9ae01 -size 543353 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index f93154dfce0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ 
-version https://git-lfs.github.com/spec/v1 -oid sha256:6b5811ffe47e59f86c996128622340cd492a9386684b440274fe125cb002f01e -size 423313 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..6fbf81851ae --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7ac1c1fa35398a8e011a67a56a66c8f56d1e1737c639204bfbc68d57da608f9 +size 496215 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d1a4479eaf6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:ba8b65b46251310f851f27324f8b37d8b6b07f0746fe035c922bbe008e2a0b1c +size 383329 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 893aef308e3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9db16e1085fe71f00f26df0fc8ea0e2711f9020d117b3f7641045b0a5ef9c826 -size 496179 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 9f3c3c638b2..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:3039e2a808d14e675858483bc1b094cf598b44796d700c195929029e8a1e67ea -size 383293 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7447c9f6313 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d2944579fd79c9095e4e0485525704f5d3b5403b1fe073bdfa3f6739a448227 +size 514917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ddd433f5c84 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:efca7f5a8808653908747f58ea74ba8e059251295d584c5c5c90150ca36fb980 +size 403017 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 35f5a74aedb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2247043ce06828f68cac608f0b02b5ac4994758bded20025510e0aadc34de5e1 -size 515671 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 69430bdabd1..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:db477436591061330b8cbf8f10b65ec5ba429bf15f066ee91fbe4fc1157e8180 -size 403771 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..a070b61a9e0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2a9199123ea4ffe49ac74e0d95573c3e11b52319b89a6f30bb205998e1b11e3 +size 502135 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0637bca2729 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:48148cac5c17237e9f0de74bc0c284dd7dd36078645f2949248386f45c842ed7 +size 387523 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3e5ccc8518f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:36fcc8165fc1064149d0487aff3b7dbff252e3b6165bc74a58b14936ab0e5ef4 -size 502099 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5be721a4d6e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:435731b8186adf8ab440d112233cfb2bc8b0c76f4f0a0212df776f9cefcdaabc -size 387487 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7f141fa27a2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d32eb9b0de26dd75cb0311bdbc6ea6599f7b0c8391d72c42987de351e1ea4139 +size 524685 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..26bde0167dd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:4bda40317096a244f62e5542828d3f1d6b28d52c5b77d57e04ddab96deda82e3 +size 411947 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 1195ac08bcc..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:100609ecf453ce9954540080ebaa14ea7a0a8d3a735c97b8040e9df807d0992f -size 525439 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 94988bf9b99..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:84084f2aaee558e305b8b2617648403c83d77a7b3448e184ee5128fd23001c3c -size 412699 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5a26735c556 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffa3c7c6b1a61e03eac69a062c31039ce39ff9d0af8acc39b64f9f069a801b48 +size 616927 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index be1fdad9fb0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:77b39c016bb796756445d9543540d91513d3ea8dff83c9b3f5a659b90c201bf6 -size 596171 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2279cd3d617 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7878904aab5ea3d68a5962d22f1f4bc722f8041092d11ffdbc9b10906b9687f3 +size 633556 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 1979029e6e0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:4d2e3cf0e8de805a92030a92a5f39bda17ef6ddee19c1631e34a1dfc55afea34 -size 612109 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1d13f1fa5c9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cb13b1b3d9c97ec04d2144fc44ae11a3b5e8135d0147e1b72864adfc3e61b24 +size 606073 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index ac2d5e2b001..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:fb05fe9600a881c236a0c2cba0fd074d2a96e03f22f40ed4d457d5f6b821f817 -size 585317 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d701d7ae982 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:581e013933dc8cca58cbd18c6346a68eef0f51d6c0a7999f98ca31c8770232da +size 632028 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 07acdfe91ee..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:b2dfc7fe224dd3cb9468777ddd71bf1e788bf7e8bae66bbf875ea63e9dfeb4c0 -size 609001 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..53da7a3c373 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fee46c476130e2d6b2cd0d7a921b7ea253d68320908d353624a1564c69204164 +size 560387 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e24924b6f7b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cdee8ce64b38ab842f06a7ef708b81b17d7484e802f1311b1a8ef9bfed72088 +size 
426631 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 32fefbdc333..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6d4b7b588550d022af058461e34402aeb90aa0c3efbe5468ab96bc4e55de8672 -size 561139 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8534ccde4ee..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b1a98d0cc74eaf0184d9a80e3d8661ef0ca9f82fd7aaaa08ced11ce727ad8674 -size 427385 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..fd3da1a2692 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b4fbba84f8ce5f99b4b44332d909f5c0684ec8a20ee86c82f47456d43ae1a3a +size 601037 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0d6951f50d5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:534a813dd4e2a277bd8c2801fa4d40766c051fac8e18d8d7e9aa47bb776f490a +size 467135 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3d07e6d12eb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5d9a77e02d2b1760324244086650639cba150fa62b0e22dcf21ac0a57dd96ea6 -size 597695 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 48702469152..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e3e54b9d7a8ee6e1b20ce6026ce3469d89635c222dd38d46c71eb323da7a9699 -size 467147 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..8a8a9c62b30 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fe965b15a3f16da727320ef38cb31f91bef0b04b1ff9257d3dcc73bcb9ccf36 +size 575091 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..214fd84388c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b756df2d799f538c2a95c38c62756d66bfe202aa3254e91c9da8b15f6ec91ff +size 449723 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 46f982f26c4..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e5db00e68894b61ebd19743674887ed06873e3ef95c7b87a29778f96dc16de6c -size 575845 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 439efc9692b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cd327597ef621b9f59363acc3516215a7a462632ba8e4d4586f3f9d469a8e2db -size 450477 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..17da8223816 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3388b7f078f94bacd1fca9fef322b48eadc9c06dd38ebaaa48040daf51d3a030 +size 618506 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b74c1c7e004 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cae2a4446ccd1f72f3b277e783d63d0dc56df2a90b87ae52325fae53af1a8c3 +size 490275 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 076112509cb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8a7863584280215d23666fd023bced384b2cf90bd9a4f8e2169252337985c3a6 -size 619012 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index fe62b689106..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7294b99c77b76f0fc2e047044c9ced1a383cca4c7257516d5943680f73013ed8 -size 490289 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..68c1626bea8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:176bc0905d31decb6fed9e74d63b6246df6ffa5f9ce57452012d5361169fd750 +size 616343 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 630ae405885..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c09d68446c0f2c175832464a1977afcf68aa84316b128a73afc618f293239712 -size 596425 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..bf83c99f957 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b45ed453d0f2ff5243b7c0c4bc6f45ce49250b094f6171ae8c02083602f95cee +size 564679 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d4e9c84305b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a116683dbd7d062d5b824e20dee33987eee862f478a89172a0b9db1f140bf19 +size 429987 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0e78388ff70..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d711708a45194db8f9837b4965a1ddc564306738afd5781eeee67076e73ae6e3 -size 565431 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0cdea7c2b74..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:247edff686a7e9481eb22a8c3fa91f82935cc2ad86f09cf72248d5d141661b8a -size 430739 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..bf39a149fc9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dae520734a696922ed421d052bb44d7599192ea23cbcc912494f5057daae05bd +size 598571 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..faa45e9e0ef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65db805ac4803ef4c0e917fa9a1c48ca3793fdc915d4a330dc0f0425dce1ed00 +size 464471 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 537344197ee..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7e8b3dfbfce805368a6d3bd86cd16391842de496f516111db6215a415f714368 -size 596857 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 799ced89e37..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8f0b6ccbaeea844e76f76859528ed54e2d673c6d16d0addbadb64b824286dabd -size 465323 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d093c582de9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c7789b3d3862970f69d4db6847c74f5231330209e3ad96880332fffb8072117 +size 589201 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..394179d0b96 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23e2c0225fc0733a39bce505701efa51d2290c813d0be0a1b7c3207952411685 +size 453127 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 6cb65020e28..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:42ddec16c301a20c057d4e0ec250afcdfa83bd8274bb87b0a798d9d81cb979e0 -size 589955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 61a9efdaca3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b378ea9c29ff3fd875e44372b58c90a63431fc2f213befe78b0e154bf929cd57 -size 453881 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..017baa3c3e2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:151bd2570bd0e103397099877477f64239d0366ac989ed958a95eba8b707bede +size 623834 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..3757e57ca6f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b355e275468b19387bd1964bf283da62e91390454aaf6bf3c7b4e9259b5d656 +size 487611 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 6a0fbacbb79..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fec1da4b9239437f7ed8e50847f9a3cd2f97c4024c842727ff5b392efae38920 -size 629324 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index f9b9c9b3a9c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5bf4a9dfbf34bb666ede11d8853562d99a0584b13cb016ab31f9db869e47eb5e -size 488463 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..29c2729122f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f92f0a805209318722ee4a832dfa57bb230a7b2d7747b2f8dc4f5a2298c378b4 +size 601629 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7ac52390d7f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c570e90ffdf4fcc58d6d9abbbcc972e6d41700bab0b5cca3540598ac8bc904da -size 602235 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0bbcfcbca08 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4988401299d6a54a3d08ba7d83e34baf7fa55a87bffecfbd53fed08993738554 +size 622304 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index c01f87b059b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0fb533838fdc2911e92c5610f4142ca665aa45f0d1825490f4c693c189e1f999 -size 622220 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..06a6577ee99 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68562ace7cff71e641a5a62fb3f5885e5ab1a9fd3b4d07577e910fc026c335a7 +size 589839 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index cc77c77c044..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7c20ada1c7b1a680101880e019a84cba3d04d44d81015c77c7b7fe5b3e9e32ce -size 589703 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..188ba882f79 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09fb1883954b91657092eb91ec48b3c1c5193cc719fb6d84aa8e491970e793ee +size 619000 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index da8e7dea7a3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f3fb6b42562ca860844dac0538dc2dc3b54eff41aeae3b87504a32a6e52e5664 -size 618914 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..fccfdbeaa0f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dc92e2fb6bad5236ae23b55890411cc922e5dccae03788d0cedc156649b0a9b +size 563291 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5a5c46b6568 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7f5b44b681e4d5337af13d33fa56851349a56d225dca7e06ed57a6dca89e6d4 +size 428451 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..50578c2624f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5650563a12076fe3d2d78024d87aa2ed2577ca248bc6997aad78072dade9dd18 +size 653292 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ed920c6a272 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:548b0c11c9e85155d96db099c08b7698977dafced83f97ac009f1554d45b9f65 +size 484361 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3f9403cb6b7..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8aaf360dc2191cce86c168239f04efe89f0a07d8ac8a917fe56b05eb2d5bc3b9 -size 564045 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index e1aae5f3f4e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0138fa74ca04b21ffd1daa66bd7f24aaa40b8918564910186eaa7f607a1b82b4 -size 429205 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 43b942d8f3a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fe9b103d6c9e6981be54431f2df310571cfb3eb67345466ce561c2022318eb90 -size 654046 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index a9f2318c05f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:591b3aa2749fc5782ea2cc7bf56c07e6485ef47285a52359d3fb74855ff15ee0 -size 484325 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..97523418b7d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd609c55507158c69905bc49f45c5d57e726c5a60083d05f93480af0c751e532 +size 579081 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2233129ff27 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0bb82062a903e391fd8bdbc5642e2f5136de2805abb5bbfa272b0eacff44785 +size 452381 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..10b993f6c08 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:398b0931312385175fd41f1f2df0eb52499a1e74be193badea952390dac9170a +size 677124 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..359ac1d89e0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8327882bb53cf689404a8317c75062462a3eea488e11ca5c0e26fdfaa40a9476 +size 507453 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 45f2d91512d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c24cd1719f9be493dc4615c471bdb30fb8e4b5981d6f7153ec19dc203f70a558 -size 579835 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3d82224d6db..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2bd509152dd83a6b2823e5eef91968cdebc5225ec9cbe36e6c76d7c6d842aa5c -size 453135 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 523a9e63bc9..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:950e614aa5ee8cf6780c375fc2dd9ecad0b4bf7c325a7ccb5b81625b2c1fab31 -size 677878 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 84756d7f92b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1f7eb4ed5dd1f4fafe2af6629c0b9e0dc082ad7215265bd56dedfc8938abae53 -size 507417 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ed792c30bac --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:790cff7c149b3889b30d53dea55e334f4315f3e55faddf8e2333998a65f53c81 +size 554411 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..169767d17df --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db70ded6e971c5585fe4404fb61c5b01fb43baa27c4196f7062ae8350911f986 +size 420311 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7726c590439 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2e2eb7cbd15f02b77a6da7683308d53167a23cb4f62f03cecdf7e1baf11a3b2 +size 648112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d9c15a1fcc4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de378810da29b28322432d6f71bdef11fa70707607798fd6ae7dac495829e892 +size 475481 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index ec395f25691..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:15b506645d34b1116e92e5f1657c927f682668a3d9ccc95678e6da45c1083039 -size 555165 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index f94481e5647..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c7dcca8109082f9f5e6ea59d057e7d205ad2e4221da1fd5343e4af9b206dbe20 -size 421065 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4535d0b59eb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:46f9756d89b22c101de0c2f551d038442cb13909e5861b87bfcb1f86691e6729 -size 648076 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0e04e3191aa..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d230273f6e9075cf34fb75770005251b89c2d1325c803cd8c1e8bf1cef6f0aec -size 476235 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0a11971d7fb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69bae00205b6c18071edacd871c8f72ab75633457ee8d071cc712e2d55c19de5 +size 576713 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..49de3a6960f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65e2ceae0bca54037212a696b9f6572cdaeb37990362b92c5ef09eee22065610 +size 449915 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..dc5301ecf60 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:561b7f79bda7308296283e12a13130c4d919495d5f37fa0ee1539a39103e6f84 +size 673030 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..40ba3d31c59 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd18d5931e61c8872a846b0402e8595682a8678ea367c0a7b778bbcadf944d8f +size 505085 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index febf5026b0c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3fdfa9ee066e35d896e1c69bf694d1c84c16e8f4233e3286da5aa2a52bfe3077 -size 577467 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index b221622ff40..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ab2d67f394b69c8007dfa376cecb61ac94b88a4053f3d089ffa3c685a80e757f -size 450669 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index c9ccf021435..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:48eb0a0d641c9e61cfbb7956cf3bb1e8240ef6f79b7518586889cf5466539a3e -size 673782 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 101e8609381..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3dbd5b6c90312f4eb9b6bd8ff0ea71d9b35226d2762c2b59f2f2f77ccaccea14 -size 505839 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..91237333e29 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07a13f1545439bd9813d27b691f22f48fc304727bf4e74bc06466725e51cde57 +size 1190360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d223e1b0959 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:9784a34d420a40abe1b250846599854cd746927c77595c6a59e5a664f4a8f029 +size 1162736 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..231cf0ebe55 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd5ca289edb227a1858f32c2d557d304fc210f30ad3a0d74d624184ae1aeadeb +size 1087994 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..4fcae5a364f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:a77aa4212092efabe3bde525bf7d95005996a095df1944768d6fd81420a85f26 +size 1148622 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 06dcf46883f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:673bd1dc6c7916519025bd81eb92471db1372f3e5dbb10fc211e59c918e32732 -size 1176314 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 9f888bdf22a..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d58709010617e9a1d0678ba03449fbf7e7efdfe33098b513f4c0b45052058405 -size 1153524 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 68bd93c360f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c8ef6e0276482d98a1df11c7f85c7feda618317f824f564d9e4b935e8a07a667 -size 1102462 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9414b7d8480 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ce0adbb0c9a83a1235564e3feef2f20c26a54801e4b64034fd661953efb33da +size 1207878 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..943bc6a9c78 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b64a4c1f6f5c0be92beca85153f6dde1d6178e0df08be1e209e49d393468c90d +size 1184002 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..069f9e56ff2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88979bb4e55ff30766c91b7f6ac1690497b828d340e464f22f6ee08c4a2ebd0c +size 1104426 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7a0024a6d2c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:97f333e40aeca280217866d701627dfd506f041cbe8ab319d3ab211de25972df +size 1170974 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index b62d62c3516..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cad7715fb915148d3bbc0b637fcc86a4f747bd843e20b5bba73c7b655de6c303 -size 1190970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index f6894cdb632..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e294df2731992947e89dff643be6d5f577169d2f6598b407846c5a918968885a -size 1167884 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0b079804bcf..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:527ebecac102abfc03988cae0e1b04beecc33f1cf474b9f17607ca33d0a96863 -size 1119190 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..3d5f5d15bee --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8df0f2c0d2bffd0671b2d48dfb4afdfdd58ff730a2c473e0221b42fcabb0dfb2 +size 644780 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2ff49cc3d1a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0aee944b6313f8c6451cda9a5b5db8984254b04429a1a88f180c5290bfe6a6e +size 642610 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..8d7584d3433 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bec67275df9123ab8150544225db4ffe10c4c6409ed95b51c6383a43aa3d07b +size 629732 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7562b3edf9a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:dac4da1d0f9972f95ed5fb285057d575496ac8b89fc3a8e974713e77c2e08b02 +size 634466 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2cf6a3bb907 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7a9aa11d3fe0b2a08b1a5f27556592dea771974b413e363022fcbcb0b759e30 +size 520595 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d3fd570ebdc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:efa81c9e726d63ce7f29cfb3b4ac6f96a27c87deedd3aad8ca938235baf43616 +size 518425 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5ad7c4b62c2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4691452a0ca6ac92425f63153ca6a7a75453b0ac63729e69667a7969c290b4d9 +size 506189 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5637d778c2f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:da45fdca134b1f3d60136ce57a1a0d5ecc3cf2445f5cb19c4554a414ff19a0ca +size 510973 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 742607a7cc4..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d61b2122b9e7fd6d53d8b2be755812737ba1e30ba809d254dcb62ae5028327f0 -size 645532 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 059b97fd4de..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 
+0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cb25c19a1e3ac7172dcf4283960d71b4a38822b5673b8f4485b4f2265ceb4e5c -size 643364 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 9d737f11302..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:254a3320276e2f4b2f7a705059135097539d2eb72acef28156da588d918e79bd -size 630486 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4f57cc4d106..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8165e035c95345f962ee47dc127a25fc323ea082c7bbe0f55d699a06ab7b6d98 -size 521347 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 92d07628e7c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a0dd19ffec88d19a18ce8e8969b12535e5dbabcb7d5396019a4412c48c18f166 -size 519179 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 
465ada9f6bd..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a7ff006c5f60e99af7852a634060d5c7aba5129b26d522d70102e8097b71aab8 -size 506943 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..bb5576996f3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a841c2db0bca7b8f68d3edd2185fc0c79acdc6fc1fdc2e2617628ee9736d66c4 +size 665700 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..edc99dfaeef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7deb01e9efc95ed936c9f2bfe5a7b0193d686d90957c0370a52441cab687296c +size 663482 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5c3e18da46b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aaa4d92355546b56dfca36cf21fd9f23df076704f0f2a4eb924406d44b2c2431 +size 650604 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9c929eeabab --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c6c6a1ca14c1fcbeefe379d4013b4e0c07d86a5831ff5727a4d5bf94200965f +size 655388 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..8ee60b65c20 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:665d15ef84a2a41e0eb72474ca5ffb6f08ed5531650813c0500255ff7e805639 +size 541269 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..a77be530f68 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1f02ece725f416ee577e44c1cf9ef168fd3d191a6326810c723e21609fdcb2f +size 539101 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..dd2a5ceec1b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:28c913fd7efe9b3a4683fc615d5a57c62141b6d82eb8ce28c7bb6e078d190d30 +size 526863 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..504ab267bfd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2da0e52edc69a73a3d632a195f6f2983a3c505cd7d669771739d163d5cb49d3 +size 531647 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index d6b1dcc5e0b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ 
/dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:769f9613bd4f437eda4d702f09729624d4749c89f08446a874f2b11bbdcdd2bd -size 666454 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8f8e1566e00..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:458f909379ee25fe8446f23a1be12a497f219abce8b52581f8e340f5c3421715 -size 664236 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index ffe743a1d2b..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:34f7559c449a0e7581d839f5fda9fa46b062da02a4f2b9359583df01155b428a -size 651358 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0c09baf2147..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:05854454812c42b4da4ddc885b9cf44cf5c180d711105d8f5d492fa96da3e896 -size 542023 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 
100644 index bf679bc0abf..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e2c0c329f291e0b986651f225aaef8fc086bec5ba7538e7f3e9c49cd27996626 -size 539853 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7abac02035b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7461ffa6ce5b8825d0732cfaf14fc9f38ee2ef722eb1588408168668dde20a12 -size 527617 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ad1d1744b95 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d1ff8a131670093b02b7f9f0a17c9dd01d6620c599a6743023b802e75d75a03 +size 696392 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e125584043f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:957cfbafabc33fa928df8bbb2aef7f39c1b70c583c1d6b28c08f9d0aa1a516e9 +size 677054 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b01b2be5c1b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb9257eba5e9fc191fd8762191dd9c37de3b0aedf175852dfa40675833c44f96 +size 680998 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..08509a91e13 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b39cbcc0683b8b71652f7e777cc6f0e9902323e860214cfc2d47415544c106f9 +size 696738 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..31880d21bf1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d2b38b5ef7c2b13ddd3cb9fd29534b185c6fa6ed35891f691b1c6ac2b435716 +size 673946 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b22dfb066cb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a9572ccb3c60fec2970073fb2ab5a6e2e28c038b074c8693c04eb6eaaa372e6 +size 677940 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0bcfdae7144..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1c7c7e1718aafa989a444924deee5cb447c9e83c989a695adea9e13154f0f6bf -size 696356 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5ea36ec717f..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:03cf8b652148ada8d03cda73af2d36cfe3a9ae4d298efbdd0fd1edc2b1401284 -size 677018 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 42217e94265..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ecc7e69fb75ee362f4f51911b1f3a82e7fc77c9e1ae00613bc5c9e2da2988fa5 -size 697490 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index b898972ee6e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:58e33723aa94ce3aa6b78e01894a54cbd2eb808cb145356bbd64657a46ba7e83 -size 673910 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..208fe32e834 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a467315f28f41c935855857659176e7db4a5d629eb13cba111877f580657965c +size 633870 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c8e13200c73 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46858e8b3ffe01603c7fb285b4d3bb3a58f5efb7f6a3c374edc2639693fc582b +size 631652 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp similarity index 81% rename from cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp rename to 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp index 1d8bbcaf0a8..329c2b7efec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:242db0aaea906fd2995acff603077bfff586354de4d673a0281b0ed8c7c26c6a -size 328925 +oid sha256:ce37f01cadce9262358e5548283c6f2c02fe5ae438027f6b7f58b175a2ae195e +size 615517 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..996f59659f9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:4ef3d9079e5bd84f9fc5ed0dbc8bb87508bc9821621d84a5082b133fd72b7aaa +size 620252 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b19df27b3f0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72e14cea09333037820e6eed1aed87a64681038886b6de6e792c488f34707560 +size 502681 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..647d84beafb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:712ef43454c5ad08ec61762dd6a8cb6831bbd4eabb5b64ab348bbc0f65086138 +size 500413 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9c13154b207 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fb99ec84aac88eb988bc7c59161ae1255b26d5b9cf33a68ffd2eb9b23282fa0 +size 489755 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5c3e636cef3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:03d25b141b6cb47c499087ac0ac7cc81f964e2e988010dc71c7ca776352e55df +size 493749 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4007a0a9033..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e4433e710f49e21fa0a8bfb2bada9bdd771c173a5fec4ef9f4430d06679e2cee -size 631516 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 97839bc4053..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:3006390b5a6628af46dd81578c682094ada87cf48cc137f281fabb5f89da00fb -size 628508 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index ae02fa1546b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ae95e5490bd59e38bc941643f8292d19b9b5bdaf7ff945d645c231f5adf6581b -size 614445 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index cd965e891eb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ 
-1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:48c5ed940eb728c70f1cc03ff977884b0ccbee557cb11b0eed81f655f513ec90 -size 500621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 1b6eff02d4c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:743bcac21e80d2ad64766d0460ee1611db509b981da9855dbcaff323bd78c8f8 -size 499095 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3276938ca9e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null 
@@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:193614b16c678f9d0ccd2df74dcf71df019e8c2809f65255e84604aca62fdd3b -size 486907 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..8dd210d2be4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d90cb51337c69e4b0421e23b8fa6773df128d73b84e10547d39c6b3f7075d68 +size 657850 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..6b30d89aecb --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a39b3d27b974c846c2a30fceb5735b5cd5b8a0d6aef0e2461e80103f566270c +size 656422 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1dd19929c8c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e77153a743b1b7bba4d64f4711fe3f56bce1a223e965e9305bfb80fd3ea8774 +size 640190 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 
00000000000..2e57b721f49 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79bdbf9d59dbaf3efdac30ec55f0b579c9a592e42aaba1fe271471e7887fd42a +size 644924 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..287b6da6682 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:099fe436b7f1c36292954489a4d2ef78e6d3201c2f8a10d80cb5370e1699ea2e +size 528189 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 
00000000000..984bc7f6e17 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5bf4cc62e95529055e4f48d7519853082fc2cbc582c74b774d884748f8c6721 +size 525231 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c29fd9b0859 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4db06510d92f6c7cbe265bb78a7c639fdc8c81f4dcbde83dc611ca79238e91b +size 513833 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 
00000000000..da6a60a1240 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60fc58181588ec6d43a324de8b506e5726eccd5f1b695bdcc6786db69bc7ce23 +size 518567 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5ceeff81787..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e981fc4d351a52b1609a696bb879fc4ec18d143a59bf85095596b48800b5c4ad -size 656186 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index a675b719faa..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c894a2fa3468ab6eaf6fb15c5454d3a7339e5cc4102c5529839ee86ee29f031b -size 653180 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 443d4c8ad7c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:edba8658c79a0b34f24d6f1b2afc221a922ba9267e18dccbf076af512ef11a37 -size 637834 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 31bdc306345..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6084c5f8b5b411c0d0f49b6db03bdf23e36e6afb5d5627a4c9dee89b54cd88a7 -size 525341 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index f0da5c7d1b3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:10125f7fef72677c6594d49e52c218cab814393f94166864b42ed0e8369017cb -size 
523173 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 77148681a1e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:af30a5f6c89c482f2596ba306dab9a671a8716c16dec5fa4d84c5a3eb5731632 -size 511725 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2a51744bc30 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:392c728d320e49edf35e6540d80335d686649133149fd81eb46fdb8701498e31 +size 836034 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b53f1f58d3a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b5e848ce8c62f481c36415961085623fc4f52af242351cb91e0cdfe9b47ef1b +size 812158 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..3a80a8f117a --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0c39782516ef5a7146ab1c536f54c47da3f852ae0a168d92301834357740f11 +size 749354 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_silu_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_silu_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..be17b633fc6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_silu_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b76b832b518159f34103b40ad02cd45ab3b19f2d1fd7ef588be46a35a80c197 +size 786550 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index 742917c29a8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c4e2b0cca4589e387913d1ddca5e98478934b51215987266f2bb2a379c591c1e -size 833678 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index 6955c073336..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:32dc2f3907a4332c09dc08f5f260d14b41c956767a1ce6e12d18c5b5d7c252ce -size 810592 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index b9b4c7d0cef..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:430948b4639371d964740347d1f28e872a53f99b578e2becef15fb082bba7716 -size 747148 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e287597fc28 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:140db5250add529a825972053005c442934b0b330a59eb6ad15a3bddf47224bf +size 695642 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5e8d8c49237 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf047d821bd76a88c12b8296f4ba4a195b76a24c39072b485270d900537975d6 +size 690464 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..4e50d253be0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:c43776405299f25349df5d6bb46f6862800e8f3a326df50f6e57c143ed99e8e1 +size 663132 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..165827965e4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32b8311ea5abc24c867fb20f77f412da62c40dcd351c28bbd9f2e6baedffcafc +size 672700 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..005adde081f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65eba59fd134b6dbd6f7fc7d41124c7aed1cb150849b5d541b54a273a2fd717d +size 573579 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..51c7d749692 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cc5ecfda7269ca110176b244f67261c1e1a05d0655125a72f38e6ee9f6a0989 +size 568401 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b6fa0d8614f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e7f47a7c1b1871e35aeada059e8018d8a69a11da1223f1dbad76e4af4a0f757 +size 551921 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c53a39e119c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c75b2911fd95ad6c81ce4ec8689acb539fb9d35c83faa15d2814776aa3f68c66 +size 560701 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 92cf0133d37..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:73cbe62a71d114f0ae133a96e53feac979469a5d0fee643a090ab5550300be1e -size 696396 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index ffe0ba73736..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:194f21acb1d42e715ab4d787f60f7e751c8f9dc2c400cbebe383c344eedc13ea -size 690428 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp 
deleted file mode 100644 index 16ac054ef71..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:81b5e88c6525aed5422b2450974efbd81185d3d5c8f271acbdd996ef275d6628 -size 663884 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3e40b6e338d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3f14801fd0ad6437868976b612b6bdd35547868f36f7699f4c01e1a1581df83d -size 574331 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index f5d277cc752..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a8fe8c999aa8d477b72d287833bacae3231eee5e318686ab9087507f9e9f0e95 -size 568365 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 75b4936440f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:263fad4505e9ac2375ed73d797f9bc90800536da66ecda326e343fb6c0031f01 -size 551885 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c202a017fc9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bea7965047f198914fc3339927f9cab3bdcb81e5ddfd8b1be4ff5b4d64f7a25 +size 714688 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..fb1750824e8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:6bd43c54b2bf2e7e997ae2eb6e95a33f27cb41729f38be42b19f976750a61cf8 +size 709510 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..72773aab41b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6257cd3edc83d6e9f0bf03966fd2573af6127e6019c730d8734113694e0a989 +size 683164 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..817427dad30 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:a3fc567afb6a9b5f283dbfeeffd9a3ed45d911fdca7b5914a1e32807c046af88 +size 691106 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f51b4c550e2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8365adf7f92b7c8468fa717e119bd6df97d1c4754442edb3cdcfe50fed6113e2 +size 594303 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1e8eaad26d7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:290e38c4e14902bb7edccf10b0b2ef6ee732b18a3a166fe30a8914f81b18a2f5 +size 589125 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..49209972364 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c6d88d3b6a1400b5e323b22b977e417b71b48f21c0951ce8903c82ecd4e07a4 +size 572595 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..04a80401bb9 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:538570baba64dce78b9f5c96fd2595dfba5719b5b8a1e7a630a60efcbdf43ec1 +size 581375 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 9f3c8cad18b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9c64c8090cfcb0c52b81f010fc16c9d69cd22fb41ddd65aa43a144453ef0f191 -size 715442 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file 
mode 100644 index f42215ba6c9..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3051d5c1456d00deb9017d54f3ac6a9afdc4ff991ccfdca943113b954dd9a60e -size 709474 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 738e2d96380..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d80e0a18c7f368348b42b088791e728f0e57fe389371b9c2793e7ea9939554f8 -size 683918 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 2d8c9ea9bb8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d7d471a5bf098637e7c46af675f0dc8d6a1239efc55ad122edf631425734f5f0 -size 595055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index e362c05fe60..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e7148d74dee3b57479b2b405b4f8e7b1f439058407fede8e3c63c8d87eae0605 -size 589089 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 475505ef012..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8f7b7077ddc35a6f3058362d882d85467833bbf6d35c09cace7429f9e56f04bc -size 572559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..459fe4bc32a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:fd5d30018ec2c0c5032bbdee632cfecb81f8a66b65dfbeb3d24408199b61fa58 +size 744788 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9a185620a52 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dfec1996f8229512678bfb8db6ebb9bd90689f10fe6bf11082094b1abc4894a +size 706900 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0b7bc703fd2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c998a9bb868b9d452724c4c3c1341eaeed30ffb1cfb6bf6bcb48b259820413da +size 714840 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 08e1dfdffec..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b3bdab1f0fd7637b567cf679112a3e4a4861204046d195264ba8464717f32f69 -size 745542 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8b36a525717..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:581f662df10ce856d69a4982ef9bc01b66d2ef20909b1469db59d1777e40b194 -size 706864 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..3ec2f7f2881 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc347a99a67fb94bab28f89f36dcd36aeb000a7b7966b69fd4726707a16cdb99 +size 665346 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 
00000000000..f814ccc8a28 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81bc88c0dded79bff5eb254b6f39eeb2206ae051b19f97d4d5a08355d6e321c9 +size 659378 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..68faf0d0c19 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bc460505528c69f23a5777a07789edecb7e71d002338d852250a60ddde68291 +size 639396 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file 
mode 100644 index 00000000000..aa46a91b7b9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f0e0b37a1d08220adb81f166cf918f805701f8e06ddcfd9352b424ed16f3119 +size 648028 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f46601f2941 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c56911fb6fcb3260240fbfed2577a1f6a360fb77bae1e3b399c38b6780894a9 +size 528137 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 
index 00000000000..7d0d94b1692 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1deaddeaf2108717560cc2c6b2a5a9a81969d7f44a3ab92ed7a7dbd88ab82f3a +size 522959 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7169a071d2d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30044b597298a7681a20d9f17bd41683854e7fa7249dde82ff69086ac8131849 +size 507367 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 
00000000000..102947360c0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61b89daf624bf330f5070b82316c5527a3142c7c75f94f864f05b5f3bdeb0fdb +size 516887 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index be46960fb8f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7e525610d6a8c5c88ae0ca5eaab74e554a09121167a0dfc3b00ce4b3690af8ef -size 663582 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
deleted file mode 100644 index d7b009faa41..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e896494ae5f19ff0196b3e8153a288325b1b7423d0eb6bbe625b5a46a10a23a7 -size 657616 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 687f9199804..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:09c9f972f80322e3a630dfe35f08b8b9027ea6b8a98f67fd87e7dc201207f389 -size 636844 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 019224e3b09..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ea940f1c4fc0c9ee76dac8e9c4692e38fd035d49abf2fabc2184dca09c1376c8 -size 526077 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 26ab794fe3f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:06ba4338a5ba2cf2c1ede4eeb82fa1f0caf6607716e587f0cb6a10958e0aed40 -size 520111 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5eccbe8d06e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9f95ec711a387191fb623fe9d6608586092596cb86ab09f51f2f9ead9c9275fe -size 506147 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5f6c3cea2f6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6361a3fdf02901d67df2b62a95635e028f69424fcea27cd8a01bb1f87c784af1 +size 
694802 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..6214a687bb1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:714dfc76776e4a8489cfaaebd11aafa58b3ffc41a0f6adad35de0b8f7ff1ffab +size 688834 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..be4b083e40f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:eee1ae4fabc99f0d415e0fa65997086252e5a232fa605180c3cd4cddb9b2c848 +size 664362 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..6cc31a70f0a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0266660380a1e4b1e76fa70a0542d1cdd2d769abbe32b8990847b4bd0a2bc7ad +size 673832 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..8bf09947878 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:a4164a90209efa8f19e9d71febd9be60f03d6b9f7355e934b29d35a6af0a39bf +size 551969 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..22caf6d2055 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e82b0c0bb7055786c373e2e7fb17c7627977c3d7dd36f0004fa49228b9a12d85 +size 547579 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f62154c141c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:36e6245e0cb5959ba5f44823f78376c1fd7e657d71ad3712b171823505d29c15 +size 530311 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7e5ba2108e8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b78390178a5846dde6787326e05ca5d3f4f319a8d735600baf12bfe344e151a9 +size 539879 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 292cc9aa737..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 
+0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:142baaf35b5c21612ff27e9d0fc7bc9dad954583de9a0c8da4b57823c7381b2c -size 686774 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index c672acbde86..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2a22f778662c1540b55e63b58b3d8a956c944db9845e184ee19bdfe28297ee90 -size 681596 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0b3fd6f0803..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:50a277f325c07c2aa9ce2d5409f88911f5ec70cfe24706ca8e97ea6d06f6a3b4 -size 661810 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 88b06d25e9e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a14f4e7ff483cfe171b3144301ba88978526fdf57e1c497ec5e623e09fd41dc2 -size 550749 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 
eb29decb155..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:90b45482178d00075c8e6f3fabc05db5e29a7c5a3d8f5cbfd2aab285d58a0be8 -size 544781 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 6158adc2799..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8de2714d16628b0e55ed6ec52ac496009cfb16ee5686e3032114c7a82a9bad1f -size 528253 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new 
file mode 100644 index 00000000000..f12312ee2ed --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:403343e348cb3da8468c610086d0adfb37b6acac3173c3e88ab1e65ee43f26d2 +size 909650 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..a8d7f31dd78 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a3fa89007aea5f6c73c874f770263d685f458a54cc5c975baf4e57d1b2ae28f +size 898502 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..86a3d270555 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22b6fc4e09b55cbfd7f22ab1d56b8e1e9486e6966f7abc4458f99567d9a6b1f2 +size 867816 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..4f1a0c85f68 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ff3eda51758dc0a08bfeb128a1e2f86c4f24e6e10a589d069430fdc3cf2c33e +size 888534 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8af7d9d80eb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fe9f3abb72623ea521bfc375a286a2111ef950b0c42ef47878ac393d1bdb335b -size 907888 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8f89ec3e07d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:4feb4ea798816872334b799d8eb6b139334c177ec27f73f1f53e3dcd03ffccd2 -size 895950 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7d2bbca3243..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec505f6db2cb5a7e70c1de79f25529f2bd3a89fff43b545ebc869d61deebd153 -size 868026 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1b83453a5e6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40d9963bd28b7f99a51b8f4f8055c6f311ed89492de0edefbe3ddd0b2e730699 +size 930276 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..6fe0dc7e667 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ced7a8b43c45e6c74dd891b5cdc7d252ebcccafa25cc9cfdc9c111b6420ba55 +size 919128 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c8750681192 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d271a85f675b083b517c6f9f0992156f34213a4c13ecf4f188ecfd5cae40927 +size 888194 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5f1b850f248 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e743d381bb90a374228cb3027a144765fb33b9ce18f8d06a1b8e974d6f3f52b7 +size 908912 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 
100644 index 9fda233aac8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9941d66c2f3e3036c0e8b2e86a967de11693959e6b74f9d402b4682ff988a9b4 -size 933200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index e12a7f582c2..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3b8a6e72ffd7a3644f1aa1383c58c794d8bbca82900e21d19c640eb546f257b1 -size 922842 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 6c314868873..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2c190654bdee9b8dbef2e2d7f2593e1f5d25d1e01b67c32a096f86f967d139ed -size 886628 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..95312facb75 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:966ed2b70fc52627c84bd546e88855fb0a7e387af0b97f6a16813ddc956e2b3e +size 733770 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..91ee7619cd8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ba10494df124341825d6294c19a47cd86c78cb5a28797122b6a1f944977c37a +size 722574 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..a1b7167ca1a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffb4999d96fbae9252e52ab4a7d0e2f587c7d6f8f0205d50987353c6b18f28ed 
+size 691936 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..88ce207ef43 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1fdfe2d46ce0c30ae0ee23b3a49fe776184ea93d53cc2b15c8aa42ae9136eb0 +size 711816 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index a8c517b1ac8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:8450a1e7978618d4f58608bd01c3bc0e10583a5ad991163cc875b86a78509391 -size 732008 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 15569a5d974..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d6c5b5c0a0ec3130d76cb7f6cd16edf5b8ad8869381f7ce24c2b8cd58502012b -size 720860 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4183163148a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:9eab3f726585a68190e514dd6c1e887ea7fbdd7ae9c72432783d0660e3405edb -size 689334 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b9768dffc40 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64655bdafd099f14f498806859644ed2e0c43604b65741bcd412cf4b7ec61734 +size 759082 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..06c13407f86 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5b61fd15065a4b97c50c4adf833c6963df5af2f0bf4a520f77bdc494eb838cf +size 747836 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..daa5b313a8d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d764136daf299f593efc5367b829fb2fd22bc229a1d2594e637f9bd1186e7b30 +size 714436 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 
00000000000..e9204903e3c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4fe866a2a8e0d58d9f87c33403f3dc6903a80c43ac14c634494efd5f3d34003 +size 734316 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3ea6dfab238..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:36beaf972a9963534060b7870692338673deba5a0d0e2554572b57e83ec2272d -size 756630 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index c92e338b9ae..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7edd35fa28f506bc3766833fb73e91ebcca07a74669f093d5220469a71450656 -size 744692 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index d9bc62a2a3f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:80d05f404a682b05f0f168aec3ac9084095139a2b583662dfd8c77815363f12c -size 715140 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ae0b1948c6f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5582643cc85895db7cbd38fe91dbff7ff2f103438e51b5146df046f6a2c7376a +size 609993 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..24381665239 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1660a9389919915e22f6dd4bb19038eab0917419b2ea402069cecbde4ea9ba23 
+size 608515 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..49d10c743b9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a087226bdfeb30ebcf59f96c6366d3490d8796fcb86df3c5d4308c2162a94f06 +size 599237 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b1e3975287f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:cd44b9c76026db519dcc34d6450c67751fdeac3649856bd526d31f9c7f91c8fe +size 601653 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..eda378ebbba --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e98aa2b3fb0332a307e561e07677f4169a613d00b03701f59f957050b9beb5c +size 497205 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9b5da33d988 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d9c19586c7e67e85ec237efeca9c00f61dca049ac86f1132d62ddac672707e50 +size 495727 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..a23e1867ea0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad6160e95cdfd591d637dad1e2b0d101ab974212d3893e033264680d261f55ff +size 488621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..50fc069cd87 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:be83e239e51fe043eef45d2a8026c3202026f752dc2149dffbc3b97f97a07084 +size 490197 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index d95c87a8e67..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bfaa46a281c2b41b77c3ffaa2ae48a5a98d3bb488c2f0a9fedea2126a81df2b1 -size 609957 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 886094fe8af..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:d342706d58e4a92c3f8dd3dc4dddcaf0cf513a118752f8c3010ede508fc88a39 -size 609267 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index b4ad82bdbd3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c71296b533d7f4b8dfac80e3a37047bde7f1d34b521a380213b1d91daaef1f85 -size 599991 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index ec96dc00694..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ 
-1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:235e77993c8e07f916bbe048c774354d1c84c362407f53fc40d8c8b4e735bd31 -size 497957 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 9a6bcd37c12..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:52ab43a655880a550383ed77335a9ce2b73b272b5f81d58faaa4abfd759fe7fc -size 496479 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index e470cf8052e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null 
@@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:25f793d56d7bc9a8ee9d6a29f2aa70361d1fb62123f02b4e615a8f5be48af32d -size 488585 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ead86b21b58 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e96a47512fb195db2b2e19e2072ddd43a5d526e1418160522c98469aeca149f3 +size 631014 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..70417ea41cf --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b778e55224d7eec04ab361a0543cbf1c6b4dd786cab2bdaeaf408956d9e7916 +size 629536 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..8300a2a965c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3cd9ea8d40c2c780f3fb26ec2a30ab8a07bd489cf43790bf892a8976697b2d8 +size 619272 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 
00000000000..27d3d58a884 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b23ae4afd3ef514286076c9b0da20dbfcc08f3f1b2acbd92d1d9e3ff56e2895 +size 621638 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b3fe7672cbd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c7ed60ca6364c97899b87fac85fcbcf2aeedb132e4132c9812c8e30db85701a +size 516645 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 
00000000000..b52df3b9b13 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6342dfd0aff099679577bfcd775c0cb73fc8350f90e3119ef05e7310621bd26a +size 515167 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2cda36febb3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0396f679bb56721fab8f3d64c4fe02a9decc52b485dc3b4c29f3a875c7bef6b7 +size 510035 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 
00000000000..db5fd5e88ac --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c518c169f2c384cd04b6cb1580514fb7fd0ee1d8b6ced828f79d57ed38d8d503 +size 509687 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4e800e2abb0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eaedd887a20e812b6036230294337e192bd71cde16f08e9bb91a9e6460124d0e -size 630978 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 9dcf6428adf..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6b2d9e57a8ce5d426fe384b3553329e6742b2926d77a4d2ea15b74c640387fdc -size 630288 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 566e4cc0e81..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2c4b699c0f49f96aef4b4a4a3074fbc1e945d6a787b881a1465612458335877e -size 620026 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index da4cd811e02..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:20023944892f5a439dd391b1cfdf092f8f477dcda63428181b30da340dd578eb -size 517399 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 434c3d521eb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ccac55a6aa9b13cc579b56b59c5964a1a390fcf15477b7cc4bfd0e13ff1e2968 -size 
515921 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 10ee319bfe3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:819ebe2f91611c878073ed3a08fb1bbb21e1b1b50f2f6e3249a2e594d07067f3 -size 509999 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..dc5c2f1d0b8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12eb6a2da853282837bb1481779d65c55ca30908ac0c5949a2b03de643377a8c +size 424685 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 9778b80bde8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:92df9a0068c0f3698222aae2646e3ea179d9d721dc13b7dd399dee9e8cbc5b8a -size 425439 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..713faaa4fe5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62a57a1bc97714db270e4e394ad6f108cc5b14feaaf2207e4367a6273245c895 +size 578955 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e87c86fc49c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9879f25cccac7cae69b1db303d572277672bd97c13d58996e4bcfc80e274b94a +size 577477 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ddb8bc04200 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb250796c250510726838d258cb94953ce6b80d92caa79a665600a624b9b0a91 +size 568991 diff 
--git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..3550865782b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e88540fd3584b4dc585704c307fbbeaa48ace7f0def98524fb92bd613b3f9f4d +size 571405 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9e1379ed38f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:391b8b0e99e15b625fc281f52fd307a3f5a4300b7e9662ef54f5708a018af92e +size 448851 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..4b75f3bc0c1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eea3fbb7139851e184f32dac924227f67dd31d678d90b92aa0bfee9945d1d8b7 +size 447325 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f00c4c580a8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ada447d4dda0dccdf7aa4bb5942f8c11a9017e7cf34967e2e6eb9e4333ce92ee +size 441255 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..dc364ab826a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd1e748b990f7f595b1ca13136391a28ce31fe6f800f12ea34e1c8284204daeb +size 441845 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..30b1f9bb018 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf3e0a2b7126c059695da62513d2697299ead2f32d386d4a104c290f463074bc +size 676850 diff 
--git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2b8e1092cbb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41bf0c8b7517483a6eafcdd5bae2aad59245bcab56c84befec49d253bfd79ea5 +size 659682 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1953428dcd4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a48ee11195b1096233f2c4d2fd4abf75445466d59186f02a6acb19237a5f4423 +size 661258 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..73830167e68 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9753763545f81f8a45f4366a9da2febbd490256a5105fb348e0f431f3708e0c8 +size 678132 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0a39766a5f2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:67316be36394607c70ed486a8aa8f394ca010d4b83bec29f20bde2f47d63af02 +size 662100 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..00434d9bcdd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b48f4fb57876b6983eb43d62e58d578800cff7a40510e85ae6fc78f1cdf59c7 +size 664466 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5b29e6c0525 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73b123aa5d19d776a37e3701cc6c401a45513f76f6ffa856615a44b4f26e1274 +size 679316 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0d545d78998 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa8f9e429fd845cfec98e837cb934232705c9ce430b171aa2cc5f076c67c02a4 +size 660472 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 
index 00000000000..6e3820c96cf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea0c33c4013c210e07cd1a2e539b4bd35ee88e3cdaacc58423cc7e5c562365f8 +size 662838 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index f7acd84b41f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7838faef1f255b6a2f123f0a7287e6321c60ab41fed5c9e77386396daeeed96b -size 579709 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
deleted file mode 100644 index 85ebd9ecffc..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7fd48bd5f64b2f58fe54738008ced544f4869d30dc98461bb258acaa9e90222c -size 578231 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 080feec37e8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:28bd82ffd0c65ade02d382c42b44e5e3f1e626d1925f84ed87a90c68a010d8ae -size 569743 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 2b949a1caec..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:99c9b47754aa8878a1721e486938e7a2027e687af85df79a6a27a5c0a3b74c7a -size 448815 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index dc7ffcf5dca..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9a6e473d70dbc95787bb3b0c2d0518f7504fac657e1dfe4f535f3706bcfe3966 -size 448077 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index cf77ec972b7..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e5d30b5b870c5acc78f3493c83731d4ded530d6cafce15922b7a367b29c6b545 -size 441219 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 10c42956768..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fc9720b4d3914fbab942de0ff68112a6ba3a86acc55e4159013aa071e5addacd -size 677604 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8ddb0d0dadf..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:68cef5ed24bc3a8e75a931a7a2a9d679a5a7d5d1ee1347426d739a057b691c8e -size 659646 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 540bcbdb377..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:7bc309379f1e4d4afab223427fae7cb3c85c5a5b6ce82207b5f18de6751c1d60 -size 678886 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index e4a813f2eb2..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2a97325fa3899d8750937927c390054ba088904ef2fe567f411b79cfd301b9e2 -size 662852 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index d99325c09db..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ 
/dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:aaace32ba6c8e9b03cb4318d190ea103daaca04e9ce925f49c15233b1adfb295 -size 680070 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index e6c5e9be8c8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:39538a40066332d8fb18e1ca7351941b2ef3689b8fe0def4b1e51c6e31c6873f -size 661224 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..2824494ca2f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:88a1dac357781079a31e733ac4a4a11c44a0fe5e0a363a8baacb68ff50844e9d +size 448617 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 3a9c4fc9498..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:501f6feaf4931387fd80c1eeda45437c3df938c22724881fefaac32704dfe41f -size 449369 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1b92fc10616 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:c48be1d48728edb50ddab14f13768aa7aa18c5bad25409f6dd452dc23369dd68 +size 602147 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d71288d13e2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c3a2ea19aee2ab0435a90d245922a2d5695511e9586cfc8ba0b46d0877004e4 +size 600669 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b136042d556 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:079d6721a7ea482cb0f334f17fd3da3afaebe72b33dad49f8cb29c0818477a84 +size 592379 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..24afd54e3af --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6a53867f8a4601061810e3aa7c4e9c4708b11198112596e27e31e36224cb2ad +size 594793 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..209cb8e6daa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:32ce25cb9216cba59331e0ad867b94e0a79d965b3d5117b92c534c332af174a0 +size 474263 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..317faf50f82 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:978239b87fee5aaae0afba6d125c078a2ad8686fcf409a57142c27624d429957 +size 473573 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c57f631264c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:531b8f7f765146bd99a7c478aeb22ac9cf8ece9562c0911cca5d470341efaae0 +size 465679 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e2839617923 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79ca8c231c1c507854c752b69c6149f874b4c56cc70ed418e4dc9019439b5fe1 +size 468093 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index b8c64af75b0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:34faa9afdae6e96b5cc6790c173da975bc7c96ef8e69febe0da8f6143075a678 -size 602899 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 9c56649e86b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4ca080f32e23af5e590f62f820aab2c6983c8d59a53dac42027109c9d4a3fcc7 -size 601421 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 15960d1eb82..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ 
/dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:915da9064326785b02f74b3dc8aac8b2e17dfa96ce98a44057f8e630c241c137 -size 592343 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 584ec63e731..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bfd2413aa4057e63e02af9570a9e5355480b42961174733475c4389e5087def8 -size 475015 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index f27ad300d18..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ 
/dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:06c488c5d42f3f22665bca689863827a23ab59582ab0a704167672d2b5ecffb5 -size 473537 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index abfc0e7412b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7a975e41b2afa9d778e9cce6e87b948a0c2b5b6cf9d472a7359913cf8256020e -size 466431 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..d9ddfe8da3a --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12355fb82ff14a8c11899c6471def92c9272177fd3c1b40c311dbacc0099a608 +size 567577 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..828ae589558 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f24e3da092104a5543e0cab974d146988d533ecf2e5c2a530f681bfa3e22bfe9 +size 570637 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 
00000000000..9550a7c0fb1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bed009a68d35b3d0b2d29105be18fc507075e6b222902c893bbb74583d241112 +size 457059 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..2a3e372fd09 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c147da1e103a7bf4a35f9f0bc959ae7ec1f273e355921e090ccef6e8e3c08bf0 +size 459781 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 56f9d651447..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e1298e19b2d7ae38ded76d4987e952545db272a94b67295815802bae64ee15f1 -size 568331 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index 5bdfb3129fe..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1caea40f926e0da9b88160484a0a991f941070f994f2817ffaed811fdef12ac7 -size 570601 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index e8f5b6d94aa..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:792c3589ae7b14ecb68ae3ec68d1e5e47ab205f4254ea110f1cba0fc16f07956 -size 457813 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index dfee57d1737..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:a1b9d8376604d5f32734124ce821044843c33e2d02b7017af05b935e406a9dec -size 460575 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..4513de107ab --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02a811aa2f6618ebc4b3441f2cfb5f01bb9a23799885d7a5b547d054831cd48f +size 581849 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..0133a15be30 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a964581cf4162c55fa3af0070cfe6e44d044ad9b9458657986b85717d7efe999 +size 584117 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..50089392996 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:925b4f6e4195875b6923507f25c8781d93d984d817fbde1dab3856777a1ce7a5 +size 473105 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..74467ebb475 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46b3d9cb5c917ab0be41901e7be9bc79690dbd73e19b187474ece2b8716b66c8 +size 475079 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 572c349aa26..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a8c6185a05deb42590943c817fc4d11c152362b1208bf34c309601b8e55ca1a0 -size 581853 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index acdae51fc35..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f361d4c389cfadf39e4f493fea812da19be067a352a7f5e65e7d84d9825d4348 -size 584911 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index dbf23023595..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9717ee135b5a24173aa669f742397e65c53997668777abae681019b217e1d8be -size 473109 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index 3eb3017d06a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fcf9d047f91bd96f8af4f61d003ca0b9353a5816c2fe9d465b6003ded6ad341c -size 475873 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..567a2cc6ff4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0ec1946905b8faf0c652fdf4a49cf6d0a9e0b2cca38004638f60e10512714a8b +size 588603 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..117915ccfb7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2cbd8149ed311ee8a631639625451f3fc73f57ab1dc4e0dfe2c40be8e7f673d +size 592451 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..4aaef9f1b2b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:9cfd55ab28a7f6f7af4e8e131daa296aea23600d5bd8369bb6a59e5d4592bf36 +size 478281 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..a5daf770c8e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a487debcc0928faf16fa6d2e6de81d01d3336494bb529b3d0d73518d4468d731 +size 481045 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 4a57b38ab52..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:917831ae10b8940efe8ff630161fd0312422c40f6a1915b1be919edf92cdffdb -size 589397 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index 69ef988b3d8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:355b6734101db63d10c114f4aa84e10da2834aa8822846ad023f51596def5051 -size 592455 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp 
deleted file mode 100644 index b307084e1b1..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:150f668601cd3c01d86bebb929ebd8497b898ddc81b39fdd4abaf88359502498 -size 479075 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index f16813cad05..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eed348993fa798cdf4beafb677ee78a884ecfe0c905debd14fdb9d73cb91d4e4 -size 481837 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..525b9a0205a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61102a1bc4d58b78221a15255c0f9f7eb438407561f8ecc656f66644a098b884 +size 603703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..8d7ccc41b13 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b31975a5be26e15f2aae776988ca02e883bdbec072cb9aa3a044fcce25c7fa53 +size 605973 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..e0c01f3d7c9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44f3c86fda981f1d8ede091d91fed98b3e12c0fe7c26c27c8a9999303acf274f +size 494221 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..76de2bf68ae --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:143eab9e670bc250d21d721474a53f982b67d6d3fdcfb0611c8f8e5316306094 +size 496193 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 34e1aa81dcb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2298bf6cf5788cf3300b66bb9156446967cf047016bc0174b87b94c2996e369b -size 603707 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index 6f68831ffbf..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:75e8f83c3afd2edc0e533db435858b57a7793ab0b2532d95d2217ec02c5f81bd -size 606765 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index bbabd83d8f8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b36b79b5c954988c438a24251d2baf74f696caebdbaa216631e61e7841af6584 -size 494225 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp 
deleted file mode 100644 index 36ce8d1b163..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7d29000078aa1c08f4f1d4f4e5ecefdb22f2b0051f5e0894c47bbdd6cd58e9cd -size 496987 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..daeaf787377 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b73e8167223c3a1f399cd848fdcbd103db97da2c8e409e5a92be886a3667f8f +size 650370 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..49dede27749 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea98a1238dc6721aad34181e37fc0e4ac0de596a81c7035214434a9818756951 +size 653428 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..47a8d426a44 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc88494dc9462b7e3193513793d3b8a2f79083c135d787200037fcf1d9f6d934 +size 515627 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..308b33e026a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8f1f3d3d4f9557fad957e3b838aaf48042ce790c4aa34e61726472d15c24304 +size 518389 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 4fa5ddc429a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:05b6467613af8744f8629121df2f05895617b7ecdeb11b4cb95b22a8fa9509e2 -size 651162 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index 8e369235a3b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5c15f7197749a025b6afef0844b4b7bbb2e3ee88edc488bbc81d5763c3d7bf96 -size 653432 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 29147513c16..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 
+0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5f2466c83a355fbb5ce491a385b150aaadb48d3c5c5f5e8f77df0155e929eadf -size 516421 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index c46b383d727..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d2f8287828060a7e1dc7f1464b5675b377641e206a40581b1b20fcf15be1505f -size 519183 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..f03671886c5 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f91c0f0aa0cbb7ca02c07ee1ed49366e946e2ffac69332586c787da1fdcb214a +size 664878 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..d9cf6b397ba --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5a2a4cc6e2e0bace16b003fdd36ad1bcb2c2ab6322677ed0100fa8f2ea342cc +size 667146 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 
100644 index 00000000000..7fbb64f03f9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13570fbd8018fc013d9d8843828122135aa558cd50f0299fcc518ca25a25c22f +size 531565 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..6bf77502001 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d50b1428dd5e776fccb42032579ad49aad86ded322159bcdaa058a3769449a40 +size 533539 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 6d80b4ae538..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3ff862760a0b4e4f8425494aba7efc3433f37b1cdba4cc39a173fb7cd023c551 -size 664882 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index 35ea2d73fda..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d696e0e132dd4d51567a92dfd39262c326f897c5f78f00cc779fa83f8c6b601b -size 667940 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 41c55a0b72d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3acc3707c1f1e85cbbfd645e685f5baad4ccbeef3fa2f248136da1d0b5810fa5 -size 531569 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index fabaef6785c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:bc36ef026d9b00358ca2619ec055dc6241fffb77fb531b8f845ae1b29e7c4bf5 -size 534333 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..c12dcda7ba7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b71e7707ae745429c5a60ee74ad414d23ffdfe6735900447686c831a564a5b0 +size 556135 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..cf1195fa9fb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:44bb3339d733e280c946a4cce6eb20e517e3931804c2274942824afbbc25c2b3 +size 559983 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..31ecfd4ea34 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8488b48078b781606df24e5babc3891dc9caaac2cf4bc377ea76832369ce1073 +size 446999 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..4420bc3a4af --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:fb3c74e8924c77e6efac7efd9ef2c38add3d748eba83d4f1d5c1de5dd374c34e +size 448971 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 4fd182ce1f5..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7912dd5919f4021cf0cae6c6f4208edd74afc1198c841f01f7c00ca5e831dbb3 -size 556929 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index e792858106a..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:836e4538fd764fb4e25b3a999bb9305ca13e43a13a6b532daccff76dd40b53b7 -size 559987 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index e0d41bac20b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:92ad854ff1f54749e03b6b7c13b34c43a689292743cd266c25853a9dcae67167 -size 447003 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index 
a4bfd69c712..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ecdeec591f730323d2dc4f70523a1d89f7d0fcd070aa4137871d5f091c7c4aea -size 449765 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..e333ea009e4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba9e386f1e77b2851d3c8778d915f0598977890c245daafa3032d25be28d4e18 +size 570447 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..c6a8dbde7d5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5846683c5543496a3dac2a92902f7d323bfa0591068dd8758c2ffcac3be675a5 +size 573505 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..7e8a85d5c93 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c989fc4f6b47438ad8694f2719b66c7f0526de163383d9bb54deb01dda123adb +size 461507 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..84066c63a8b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cafa299936a6be1ad4f54602c75cdfd0c240967d68971e93be1c800eb69d5270 +size 463479 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 6fa6e1412e1..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:cac91fe71eaa83240d937a395961e7956014b7247e4cd5628b03b6e0774c862b -size 570451 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index 8a09ec96278..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:42c1582fbc190e71365766dae8403b74d2f4523fbb44baecd3b3c44a127b731d -size 574299 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 10a5b3270bd..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ 
-version https://git-lfs.github.com/spec/v1 -oid sha256:3ee0da5cf1590872ef34a34fbbcaec8f38f321d289c5b6c85f927953895254e7 -size 461511 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index 091ca70d093..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ff65167d3f8abc86a5b2ab0d73a3b4b955ba49f18e79371e29a4e9a67253e5e3 -size 464273 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..cbb62b09b49 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9aaedae00df3cb56cfda21da58bebd5562631c3810a32ae265e449cca434b5bd +size 642668 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..02e27964123 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67b719c384fff4aa3d73ab695ddb9d40b8b2da7ef528f525052720c0af800395 +size 645726 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 
00000000000..869b3ae7fa4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f65be076e45227abd8385404ceddee2fe8c36be7efb7d898d6a7025a7bce822 +size 523465 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..ce00cfb8deb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c533f1f4609292c7a43aaacbc708a5789cc98f922faeb4e6ebf637eb753cdd8 +size 525439 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file 
mode 100644 index aa5c2111d7a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:48eb3b24927bc079e4ba0174347faaa23aaa82240f7b758b431bd155e434de86 -size 643460 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index ea24d7658f4..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9d6e806cbb73b0df8f8f45d8afeda241ee69526c0763e81bb1123b2e5d45605e -size 645730 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 51e42e5891f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fe804e30edb266d40a1decea025c615dec51548f1320a7a96554daf89fed215f -size 523469 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index 6fdb99b1db4..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:89e5d57371d1daf5a1d1d75e49dc5609efb8847b179698a9a4acaebc828a3f2b -size 526231 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..a27719a13b4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d2dc543f2b3610ce7e520e070804f124c638bfd260bf466f944e74cb5499f43 +size 660974 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..ebd1f5c51df --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:72133ee71c3877f12ef2a6929f1ccf49310a6dec2f955f645c48c9003b957eb0 +size 664822 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..97396b6ef2f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb3217556e699f5156c78d69cfe2c3b21db6dbeadfd95ca0713db1ca229bdbe3 +size 544287 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..59b75f6bcfe --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:99dac529bbd0f4a2f8bd1c3eabcd314ea27ddca85877ad42c2e14efea03fa6a5 +size 546261 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index addaefa0306..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:64ed119823d8b6a8620d2071316ecb3a7e3efbbaad43516e8046ef5526a7e752 -size 661768 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index ea589bd5b69..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:01e235997a644082116bd5f8e63bd4477a766fb2af4340363cc3de551b02682d -size 665616 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 4bb65740eda..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7848ad1d3c72a0a2f0b25450cf6d28a9012a319a4b03fb593dc22a3f06679334 -size 544291 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 
100644 index 3388a02492b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:00aa1668874956d0b4e2a47c2f3c7507f6b99c5d569cea0e3ad2d593e51a58ae -size 547055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c96e54a7364 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7a462f34fc3f18db05afa19eb08ba2ba0cadf782fc1771d1bcd1b7fc2d2b021 +size 892620 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 
00000000000..c95d81ac288 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:380263c53324599bb254aaf9da9e12fe4287d279c3b0c287c711136d31b3ee88 +size 744360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index fdfb8e2e47b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:526684f41cadffeb7d83218452289ef418f030ff0b241f2421a8a6212ebec05e -size 893314 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8ba3ba641a7..00000000000 
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:32b8e2fc93930700c6436409670287d6467174c8e4c368dc65378849a9752024 -size 744020 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..084442412c0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d1d1f5648ca6e5ce9b50f118b70de72b00bf7a3c375f8584b9adc09d38dcc29 +size 580389 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..86cf07787ec --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6874a4e394d83373780a20831f7f362c069d03ea402a4e564d64e8cd51e00b48 +size 531055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..a9ed8983f21 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b174bd3a83475b5c73b017bf3e7553cb055854f274a2ee5e0c402854b59e197 +size 581865 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index 654deb6c0a3..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c7cbc425ee845d54a728b4057ccad43fe4f8633f1b2411b6fec5281ebd700b0c -size 585229 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7b882e1140d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:70a849eef32e6f2b2ea4ee5835b19f21e320f368dee7b19a56a2e48852177c4f -size 533229 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0e951dd3144 --- 
/dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b55a88dbcf8de0a8133b66663b3f75d30b43dec9fac95d9f80df015f370d324 +size 916008 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..eff4284ef25 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cafaa8d209baa3c3403bf265df5b2472e17474a6634905ea4b294ac89ccabc65 +size 761434 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index f52ba50e2b8..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6b9a9a21b261e197e585ff5bf9df1a73ed63cc1238796ccf25e063141145462a -size 915912 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index 734a9a2e55f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3d0d73673f3a2fc7f0bca1b9ded5bab8e00f7a81f314fd8f908a03eca8a85ca2 -size 761092 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ca013e2d4ae --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19ad5c5646b774a28da3276b25942feb8168259a7303f25ac458a91437f517b9 +size 591789 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..22eb29491c0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:422fdb785b819bd9315c1a6bc93ecb22c90e12f478bf98029bcd67ce0c84280f +size 542405 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9d0c5df425c --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b0729f3a51c215ce4a3890e960e7bfbab7a05eae0e562b9f43089e266ae6e4a +size 594301 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index fb931e1755b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:051ca20fd6f3c053cb97a2366b6d5fb60fcd8a37e06a1c4c09aa2ffe0fca48eb -size 591005 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index e3abbb41792..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:10f2bf91d2d56593a5069cf98715f68ea03217627cf6c99f1f95d3a180cea9b4 -size 543247 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c5e970b4d7e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d8e4b1ccb75ac7a200b3de6797d08a0b882e4dc98ec7bf015cca438843e5431 +size 573673 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..dddfebaf29b --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e552db9033498a81f9f7bc00b5dee1209de2b46165588a374075fc40ec2d72db +size 448797 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp deleted file mode 100644 index 293a6a8583a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5ad1c495b8d2ff469aeb010493aed1ed58c36c6f079cf6d5a1e11b99d08f1010 -size 573677 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp deleted file mode 100644 index 58e26744fdb..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b2fe4a2a71d8a5c16a00570042ebd893736d62d7869a4fd619593f6827783d8a -size 449591 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..53099e53ebd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e2833d705020345d1aaaed4c61a3cacc32892d44e135dbd19bd221b551ae8ac +size 598541 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c5a3719ed58 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7c3a20e301f873aa5dcd4e7621ee494448e23c7afe56077bbcce24b3e4a98cf +size 466709 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp deleted file mode 100644 index 680b5450300..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c309b5a740e055cbdfebda21e45f7f0c827134869dc05e58fefdcee92cd401b9 -size 599333 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp deleted file mode 100644 index 74d1c2e0202..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7a8cd039cef6a0399ee96b8731ace84ff90a4aa744edd5ed756e29fdc45a8894 -size 466713 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ca88234df65 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:589d4b08fc5ffde9980ab12c9b4838c17529ca6b967e8f50a02c747d5dd81b94 +size 434891 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..45a650b1f68 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f003a3830bc67d2475a4fae14d0dc94d8c83d9cbe161549decfcb291eda8079 +size 423297 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e02f3e2464d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e17dcff605060494d253eb8fa3cb19aa89d870e7f8e0a559f59d37c8549d25fc +size 428079 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..89f065b5fe3 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:862f7a25a606ebafa35ea4418fe57cdde5bc1fd4045c2b58fce56e339eff4e6b +size 339567 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..73809961446 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e883dd813d9a10e86bd3b8de433419d5b45905282637b8fe6bbe17994967e93d +size 330093 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ef93baccd37 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ace73e8c2db0fdaab97c724c14b8c3d396daa0d127e0f5234902e69f8b32693 +size 334087 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0a5d12f42c9..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3f05fb93984a1f7f64a83c9dc8e5ef31bf644f795f5fd7de4d4a4723ce2841c2 -size 434895 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 58a79480e36..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d608a60d345a96a0bd7d52601333a6889e45ae02e18858ec8fce60b3878b6af5 -size 424089 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 636728965b7..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0d97a2df942fad9a66e3fc432281f707da7bb47b76a4224e63d214b9de08dcde -size 340361 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index d85e760be15..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a92e8ba843df8aea2d7cfbaae4158d39293d5f53ba354d2d4275738c61678f59 -size 330097 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c4ab3973d41 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:201b37fbfd00c47253d1b6616a4d876d7a230ffb82e68af47422a9e09ea389f0 +size 449499 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..34b37a407a0 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0e7c255f731e17e7b3a6393985ac1168485b30f1e3af0759a3ff61c3ca9c162 +size 437953 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..37b3ac75684 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5861de5b765ca5d8cbd7b5d6c3e585d48210ad326065b011b940f19bee549f2c +size 442687 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ad3653198b9 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ede340f9e2c4665bfb8d3f4d72823ad70771e3e312a803746571cb0330a21fd +size 354125 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..23310581da6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c1c4e3bac00aa2105cefad42edd3177649bba6b561609faf03c81352017ae47 +size 343911 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..4e98dc40e71 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d206ea353fae1f517be789d3f7d5d934e9fd8060b8ae3fac7b4900f4997033f8 +size 347905 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 36b091edce2..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cd2bec22e11a92d1791b7bcd88f534c8d2bca4eda72d2fb75980a18f28d14344 -size 449503 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 26a384b4d37..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b7896a31a37e08ec4dd4b8a67d5bc2bb56f29903036f314ff47611e09ad3bc22 -size 438745 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 353b68182ba..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1f0bc82b7a822c0ea98fd3b35cbfc121f54d5ca2421cd7012e472265b4c8760a -size 354919 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 632e3968917..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:312ee5a9aae02acce880e84d643b9e714cbd2f87ab2d053b7a4533480a43d4d7 -size 343915 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..74a9677fadd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9056068cae55a4f1d3147e2456f2d49c3bf557ea4cfdfe18baf0b6842ee74169 +size 662530 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..24814ef57ef --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e47c8f4929691a45897eb5de12fed04165bd871c430e009cbc7853a4391b39c2 +size 593411 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5ed3ce83e4f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d32d437e1fcb12c58a4398d7ca1f29fc9ca52f9e500af22e143b906adc9839c0 +size 649898 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4d971d71c60..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ca89604d7b5f69f188e946f64d025e72cda1e0422ef5dcb82f36c84a0032554a -size 663422 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index 59a62109b7b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3915d45f62f5b5957390605d46638655394b0e2a19c05e35de165c878a91b09e -size 592923 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..65eba942e01 --- 
/dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ed653805053f7adc7a0317beb952cf00f9279b2968d2de4ee4d390d38581002 +size 673980 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..27698b4c540 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed4f7851454966a7475123436c8816dd498f8e9c418a0feed536ee12cc6a044a +size 604911 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ac627f21037 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a778f5ce5b1d6fa7643bc5f4f75c9f3b8084f38f010a470def105c8ac653b74c +size 662186 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index ed3d837118b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:26851e11c1edb2e66c7a356c8aedd02260cc8d23e29fab29e56ab0ea3f389565 -size 674774 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index f1ef4b8937a..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a4cdc63e768e4673c005c50ce7465606c1d2efc73629b6c3ba424b5b6824e30c -size 605753 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d037da3d01c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5459bf2c334c1dbd5491cb2cd69aee20681b699d32cd03400c9e52757472b590 +size 601809 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0fecaf6c9a0 
--- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97227f75807cbbb77686dedd29551fba21e3a275334ff26f1e8f136109acbb16 +size 549217 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..75d926bc055 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1db4ec3abaa48f56e7c43e68e080a97ae066c7267a607ff184d201b3a0bc296 +size 587399 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index 
ec57f6f2268..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c8f7e401af3e71250a3d464242d9d3312d1b4234de9e43388402e94bef45dba6 -size 602601 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index b0e59e6e33a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:44a69b5df55ec1e6f5a236d45d4780ed5f0b13108b2b00287ce4642e7187fc00 -size 550553 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp new file 
mode 100644 index 00000000000..75bf39c23d2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d63eaf2b8d12ed6fe88acbdf2c8cf3d668a0c9b765bd105609eebc571638b859 +size 615033 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1d00b8a8a1c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:633bd5933f8144fc40a90fd509898f60ebf3f21dcd86a61bdbab7149c5367c34 +size 562689 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp 
new file mode 100644 index 00000000000..92066de4595 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28aaec101d4aa36d29c590c4d26563252b4fa66a63185b0b3c3375720eb6ed07 +size 600081 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0b369d58e03..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3518c7bf7d7e6bd49b4f7d5035cc78ba2cb41a6dca5369a0387e650da521d20d -size 614101 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index f1909848f52..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:12b8570595eb8a4601ff078ec8fb2ceed7994e1244df7c791cade81f9ec97726 -size 562693 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ece763a4b3e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a259af922414f72601a4eeda15f418e540ccaa219ffe5be118011cc7be8bc58e +size 624930 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..07ea0cc883d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2d697073f43afff1aa826d838c6257056618ab1fbcc07eb7864ec420f603127 +size 495615 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp deleted file mode 100644 index a0767eaafc7..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:00ffd2cc40037c680ac86b686ace333e5e12043e83b1cfa23c55bf3191f79782 -size 625724 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp deleted file mode 100644 index f5a4f0e964a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:23b6a033425d34958c14fbcf54eb3bbc340229e1209952dd7aad9ee9d361b83e -size 495619 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0c4c83cdef0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba5a65592ca2ba8b38b4e45a40f5bdefe10f32a75f126a2453f73ff0b478290d +size 648614 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..88b408893ff --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a233d7eb32bb79957aa3db6074c67b516a2079081a1b5c930d94ba0c54bd477 +size 512885 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp deleted file mode 100644 index 9f9ccfb91c8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bde71729c556dacc9ea90b7f8a1130cf61fd4c5e4b8c3a732104f91feed3c7d1 -size 649408 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4360ae460ff..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:758aacbd1344e8c6fc7c250017b9dcb5e5c9825f1f87cf356b830f9791160780 -size 512889 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..baed2113994 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd2b50acb4508871c43ca5697bc95632c895bf0dcc106bf3f08d524c7f0f78db +size 454379 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..603c8528d81 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ff053332c6d0e4e7e8cc1ca8a3954239a7f6b75f483363bf16e1841fd8e8215 +size 437653 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..21958ed3264 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:719b4eeb3b32e6461ceee5eb6d9095cbc6d0b6dbbcfe2d9ebee93375d1c56a58 +size 447171 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1428f481631 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f52cbc59b0b7113ec8b9878b7021e606ecb5f28360f2e9fe03123692ff18ba6 +size 358363 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c754679dbc8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2def776b52c4842e996ffd6fd5410e7d53b006142ed5f4093b53fcf0b8728ce +size 343759 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..96abed2d4ee --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf6bf85226fafc37a3a60667662011583ac8449c559ce699b6ff40b15d9de634 +size 353327 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 33c6070b271..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:13f3fe2eb02f905f8343b704065b2c05a15c7b7dd9c65e25b99d6ffccd2780b1 -size 455171 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7ccbe3f47a3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:65dde9c5c1361eb2db5183172f2f5718f41df46d03d7b3ac8af3697e4c5327fb -size 438445 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7795d202edd..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5791bfd62d436633304cbca36be501f14a7d9262e0a7f0264ffa538167ce1102 -size 359157 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3aa2c0e67da..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:331403f3d4a1530ebe2a6fb1f3fe67f49a032db5d2b4f71191e0364a3303cd9f -size 344553 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e1b00b09f26 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4946addd472be2a75394db5df635af77341a08332274e65bb2ec2c9a5b7fdeab +size 468245 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..507d158683d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d87f6aa6ea63ea2f2a747ddda6d67579bf2b64d3c2fa56d26dd44b92d657f3a +size 452259 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1ab5aadb88f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac73afe3be3f78f535b41d45828f53ac410931cd861bb7a0f0954d1f6cf38fda +size 461827 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b35f0f6ebb2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41445277c2bf6957f607e4018b0d08bcb2c81e7b6ff17cc1bad79f92adfbdc41 +size 373759 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f2661ba6d54 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a4b96a4324c33cd646cb4b81f7b15867cd6feb14688faaa13cf1454b7883488 +size 358365 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..bffe3383570 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc5c3f185a7d630077246772b4b8c72e1dfc9f71d5d7e189c34bfbd071547bcd +size 367885 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 78c4cf16e1f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8a925cd05c89a320013fd079a7b42b33365ab8f8cf49416a6203ef181c4ad554 -size 469039 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3f8f4968241..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:06ee7c2be6d573d3768a0656f21ed450022cc100fbf9a8046ea45538286fc7fc -size 453053 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index b0bfb0d3b6e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a66cb8117d8c6f9443a99f8199fcca39c50fa97a4c3f1ca41c68de83af28b5e7 -size 373763 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index d3952a2b008..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b494edb9003f0ccdf0b504799cad4d9e97def2645a71308e0c30b857e5b8a5f4 -size 359159 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0a4cdbf2322 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94124d32592243dc83eee7ec5e966f1a3636fc319fea9da9cb9ed5c2a32b7425 +size 689952 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7320b6358ba --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fe3f995b418418bd8748ecb81353c40e73f36eac1c0545beec531cd7c2c1ed9 +size 573069 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index 199094805d1..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9a8fa50a8c6c96df653ea7c4e379c411cffe4cea2cf39c12b21a76a75b4c3235 -size 689956 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index fb81af5e24d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0a553dfb9045aa23ccca989de33328708e67cd0369d18b6968e8350e13be30c3 -size 573073 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..4cbd6f11093 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da5d111e82ccb3183d1ac73f4f08e1c96f29378c6edef290496b2f056a29b5b2 +size 716546 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7eb1b332c25 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81e8f13d8dac7b0a584b8c6282a6faa8292b14e3bb83703fc8ef4835fb35c51a +size 590191 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index 33f7e64c682..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:324af3ddbb3046e9493bb2d3e95355b2b8e6e3c1f1db077be8f047a1af5ea65c -size 716550 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index e6b5b45d559..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:16672d76149e35c37a21ef4ceca9fa0289ec4f33f4fc0eda45ae0f8ed927683e -size 590195 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..bbc49337ff1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91e67ad9dbeebdd76cbdd541153a27888c9947246d218261a533efc944887bfb +size 517337 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..cd8538ede62 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:025cbc34cbd3cf150972834e332802cc91da0a14325c74e6a4d4770eb5a8f93f +size 483147 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0d59ae80249 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f65948ac4284af1d7a21fade652ac3fbe5c5a567c2cb7506e9e3bdfd525ec90 +size 510329 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index fcd52c8431b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dedcb246712517575e1a67fa470bf5ae89850c0beba3c06e1bfac6271bff9405 -size 517145 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index c917860bbe6..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:62dcdda9d091faf83f10fccf7aefe2cad9cdae58b76d068b6eb68790a38be9b7 -size 489565 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..26b478f60e5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8056dd7bf522a48f2377c0f99633468e8699a32e223643fa217a2bdc6da6987 +size 532981 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp similarity index 81% rename from cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp rename to cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp index 14c55cc2cee..abd9c0eb902 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd920526c0f3a4c4cb2b378ec5ca1df5d467a1a24c2f5cf568e809ec95599287 -size 598231 +oid sha256:410ca3723462a7b710c8d18521e45f87b6d9946fa4f088719026fcf91337513c +size 497655 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..4535076c91c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aaa376a4e5212397386b628ca208a67a67fd1b30a89287012fafece292ea37b8 +size 524885 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index 9a69c16b338..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:126d6426f703cb20e56607f701dd1697a8d64ef040d2874829c18b5251e02768 -size 533773 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index a75f808fc91..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88eac519ac2c242ae239ebba2fa6d8c741f5d6c4516b10816cc5210f2225931f -size 498499 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..19b106812b3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c432113c52951349c5c2465342899d03c2bdfe6136852d57df8474e415bbfb17 +size 415995 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 1a9cba44d7c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8ade7457167307acd98dba6abb06784cf89bf6937c02de2dbb32ebe37b115139 -size 415999 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0bc0bc8ff74 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bac8b90c45b6140bae0a246ea8a0d0afa882e86ee6476192c51545a6d85c2175 +size 550035 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..96a29aa8f5f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfcb15c2f8e38fd64c10170ace7c9c6562f33c4ff9d606eedacf8d734c550399 +size 428565 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp deleted file mode 100644 index 294bd5221ad..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4bb4d4c57b1569929a3784c891165134768919755214c8580f73e952e6432e97 -size 550829 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp deleted file mode 100644 index 09c4b1c4395..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fe6491ab7d5a4575291dda2f641ee243d81f7819f260e1d2820ce53d89a7b8ff -size 429359 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..c362afb8d9b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1ba03b73822507d34448444d00e93d198ef9798ae20a087a967aa499ba8a117 +size 433215 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 4d78f8d8bbc..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bce6634b17055fc43628210b96092dce7d7a4701c777294027de70a4c04e8289 -size 434009 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ccb1dc62be6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56ff79d049d4b7e02afac97bac74c5a6bbbd92743c132e670b2c718e675c004b +size 574065 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7b465ec2698 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2d94c807e348e7270884a1951c72afb0a330e4e07604089ea8b331c429f75c3 +size 446575 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp deleted file mode 100644 index 84b5d0193d0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:581a57bf7e3c36b2ab952d86f0630e561526de5d4ab13752feb13de9c1d82b3f -size 574859 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp deleted file mode 100644 index 30c943deea0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:00aa83431d7bf145b9fdf4c14a6b9519edde561dbdb8dc14883189780af9e943 -size 446579 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..31d6abb2abc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da8739c98dc3042e663dee416feda613bd0911b92527052bf917b8a21686a213 +size 425759 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..da5b89784b2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e433d084eed732d9f9c3ccace3b2d75b84f4b0e6d335602c999c3c42db3328e +size 417815 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c9b43d7af79 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c4bdfc9b152c36adcb0a72045e7f2853c9c7d73e0c6383860d1b81d5dd3c3b8 +size 420329 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..44534baa9a6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e629f56e2fb5f35f6f0780a22d73d2d284a2aec4283be6b4970682cc2cfcf00d +size 329745 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e73134fa3df --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:643a84538ace3ba24151b6c23367db623a2862a18418580f9b745b3627730f50 +size 322983 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..69f0e27df06 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6a38254634481fb65d09004698074b944a383c7cc5dfd3e4d8bfab34676f0ed +size 324561 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 061cd4c2f89..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:79f7468d46701f5ccb9388499ffdb701e334784c1b601c9adada021e9bfe1978 -size 425763 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 972fb4b34ad..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:64b3a8fb41338d911173070443ea22408127ab7250bee9a0bc9cae6f72b66a72 -size 418607 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index cec6615968e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ce59f29647046597d0fe0c7a16f5a0b91bcfb27342d996640c50d6aa0101a2da -size 330537 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7b7ddf9dcb9..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6b159d8e5a58f315e3ebbd03fada70a0705eff784ab4586625cfd9bd40c9511e -size 322987 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b204a8e9076 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7c68450ca2893c342b3f67929d6504384ccec8e179832efd5f31007f110a78b +size 440267 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2dbacbaae48 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c227ca255d091b0a7bbfe9c6a5b49b5fb383042466b6c0025ead0d1bbbebaf67 +size 433161 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b72e58445a8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab4b953ab070b03e36ecf0a7a8b7c962b5b9fe6e6da9145b3100642aa9a3003c +size 434837 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..6fb4cc6d1cb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49423b86b73b08a2b5f4ee6f6910c7d0fcf61ebd0d0b49c5f7933bdcd59dac2f +size 344301 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9bc7291b183 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff2fc3211b777b810badccee30eb6d884ed9259361f2472bbe1939c1cabc468d +size 336801 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..abce39afbff --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29330f7b7d24f2795521ea967d11b1ee26dfa9a65179e25a213cc4720dd2baf8 +size 338377 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index e668575f218..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:27cee5a328d19df6a3c6293c5fb112a560a79f053fae15c64efbc324a1394fca -size 440271 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 20fbe2abb17..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b0f88b6f08f66c1cd2e1c7a5849043093d68ffee028331a2080ae50a5da15f93 -size 433165 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index a073f3cf51e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:55dcfeccb79b49d11d51e52b37863e8ed079eb887fff093efc9313e0f2c29195 -size 345095 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index e4493836b48..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7b565e1f7b60c24c2ebf395de40290a23e298da305c3cfea69c6ca1868525ae2 -size 336805 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..e3833f96e00 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4feb7e0caa4866f815c9ce047946faecc5093505ba05bc2b823ed9fecaccbad5 +size 320165 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..8b0a0a0c747 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:012e57c2c21bff47dd7db3548a1de5189be60b83ff0d24d39a517704b97569fd +size 324321 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 88c2615d0f1..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2a855aedf5af80fc2783fbfb1cd11f4cca11a4a0f83d6caba1b36e2690b27ad0 -size 320959 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 4eda40202ff..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:16daee862e65da33eeacbbf286307fba3cf8346d61ffb0e365f088191c33d254 -size 325115 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..eb0a8ed8bcd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:d9b4172a3874ef7f0a7d22d82fe59db26a5f05e58c2b8960e27dad1edb92f884 +size 342763 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..f8db579a011 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09b3a6782abe42635191c0990f0d0edea2b60a84b9bd65d66f93b539ea37263f +size 346969 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index b109a9dee4f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f08b2ca8961f089842159d12e51d3bcdee753dda8bd664fd8589440a6a7afb3f -size 343557 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 8bee01540af..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a2e92502ba7c4bb835f0e1ee6697ec87efa3aaef4d11c5c633d49ebde811ec28 -size 346973 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e6a5f124157 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49995c11496cbe78ccd562bbc73eff372206e322a115e3a6348055564ce0e247 +size 532431 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d967e3a87a6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f5959aea6b63d8960772d6add103fc2949456e6fb90228359bdd253475a780e +size 419791 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8f0d1d7e031..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 
-oid sha256:54268fec41c88f4d872c0b00226b17b0c7ac082900b21c58aca94abe6c3f9669 -size 533223 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 58cd78c49e7..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bb83d45c84ba77baa3e572b1161f7de2e188086f6b5cf0cf41e936ec8ee41479 -size 420583 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0eb7978b6e9 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:550cd962cbf31aaa36d21e9bab824a77a18cf0ceac339ba0b573a556a75ac39d +size 553007 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b106a72a99e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87b8398a35ae64ee9522bfe4c76ab7687d4b45d9335a24e0e580dd806334a506 +size 442043 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index ef049ebfa7f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:298302772567ef50f1cf969336def0529dd62c47ec6608ed0a3cea43f0df7e0e -size 553011 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 1fb1bc262fb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:33fbd9ab1822ab97f466e2898129534a7961cf1f59228bbfd32eb815ac25c75d -size 442837 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..03a6ffbc013 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f7dbf5756b28e59b342057f2a6c22187a0c3c8835b952366ef74a661f36dc30 +size 551917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..4f3bc4ddb7e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:31901daa015d307c08c5988f119bff2366a05341278fdbb034bb4c8d360dcf47 +size 440955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 905cf636dd3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:85abf7067a876e6cb331b3805aee0f97777d5c7dccc93098ba1b823a4e4b8844 -size 552711 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4b5d970c8cb..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5d896cdccef267fd549c7401f6e4c9a10097bda7e99a3008ef4d454572e98f1c -size 441747 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..745d30c9589 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d52610f772442634c0ea7b7f0049cea91bd51efec9ce0209e1aaf779d1182b3 +size 575107 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..01b622aa6eb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4213f96439a240f4006e58d227407dbfb6eb7550690727f96afb5dc404d97418 +size 463207 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index a6e1d9cff37..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c26dbda7a6f04051fdabcf95f23147de7a002dd0a9b6656a962db980258a3463 -size 575111 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3919ad08e61..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:92d2b083e69d35bd4ac204d33215858128e5fe3c73c6f2273feb039de5332b6d -size 464001 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ce07e49597c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:56255020c5249428838b5393c04e1b03df65400d9a67c2b011960202a46d1557 +size 610771 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..df1d7b4f097 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17cef2912e3d845f8ad1740421bf2b4d3a2b9a58af55e2e4970f8c34d2092da7 +size 478299 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8d546414a46..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:51245b6e72a51da03d5202fe84577e982c32483c505550daac1919d3ebed7e3b -size 611565 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7471e145b6d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d6c5f1489eae7675797329c63b034a0baa5be72473dcdd2f0b0454df63dd780b -size 479093 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1239184b9ff --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:516acbe2bbc741569d9e98958ab8b4cea8cf109031d345a63fe83425a264adf4 +size 638650 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1f3a401c451 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1a339ab0be57dc7d01e888d137b9a7b153ca7e9c52df79a68a10660c599fc5c +size 500553 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 471cdd7437f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf9bc3ed0fd96eec359855805a98d09c414cc0282cae03b8655a525c1aa97102 -size 638654 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7fea5e25936..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:a396d01af3a15af103f57ecbe79403c507fccc5b56262167f3177ec247d82934 -size 501347 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..308091699e1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f79f7cacd6c9b3cca72644ad32171b4d51c431615045173d8529ed0e289c649 +size 520781 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f9e021c0111 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96c941cbbdaf49fe22cf1c5cdd84749334b3bf02000ee896ff5b0504d15c3870 +size 408981 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 9bd7f23e38b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:214c6dd737a8de9aeda043195d06ab0faf2c69d84abba0879e922658675c2690 -size 521575 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3b5c8b6a247..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:59ce0423f423e3172040874238859c6f83b4fe9faf6688c742756368a64ce976 -size 409773 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..6e6c6524417 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f31543b025a2cecff4c4789e4cb59ba03730b79f8b683290145b8d6abf62648 +size 541703 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f394125ddd7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b9ef5abdba97bfe51b0690b84799b34237306c16a7af873152bfd410ed39be8 +size 430445 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 1c689f4b891..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:a083d3f8b34b6722aac0bd76b990e9f4c11862f0db26734f2c4895894d0cf540 -size 541707 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 83fac9d2e78..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dc4c66ec1f9c3a1e00b3bb80388183c3707ace82c4e5873a8b1e06587a6f168b -size 430449 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..6166ef32d92 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9aea8139d53fa7d967b93936d98985bb30c2f149028350b8d17e9a5915c702d1 +size 523545 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1ed176fbc53 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1ff3bb0e270f5ae139d75851792926fd28b560ba675bdb6912296201aaf2e51 +size 412285 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5bf17974943..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3eb4c544753b1e3540f6ff940017766e427f81bbcc9202615e712098134b9aa7 -size 524337 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index ec45698f91e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0c19fded839bd2db8eaabfa526ba92747f515d87259ed1c92020bddcdeb928c8 -size 413079 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..187a4670bef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ad0edc2026b33862d4aafc31d1a46f26fc2549c59479a0e70f48caaa716ceb6 +size 548561 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e58f9691547 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:aa447d978144c27b193d9d2043c49401a0676cbce1b4c41af9735ff94a914880 +size 440903 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 1311d3b6bcc..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cbc822d64c072df485596d7dd4b328289e5b17e12c4c0afaebf16303a0fefb1a -size 548565 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3cf07920e8a..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1200dbf28c9aff0e13ef361e7dbff59712eac9927b03325e0765c7579142287c -size 440907 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..e4c8bba073e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc3a892bb7e48d0fc71e4883a16b83e922f1ec8e0aa33e593df0be9ff6ac330b +size 416791 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index f75cf5ae8b8..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:29617a1392e2d9cabec20dfce3dbc5ee8a5e64f8d749c3b4aab98a1295b2af34 -size 417585 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..51071ddbe9e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01909be85fe25b0cdcadd3c5e9a9d1b33dbfef52fff65ade954a56b1572c38bf +size 440723 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 75b533f4336..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a3aef0892235ecdb70d1f3ed387c37969f7119bde8fafc078e042ea11d56f22e -size 441515 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..8ced2e7afe1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acf98ca04862901dd33af74179c414822afddb479d9dc01c50ac79a559c2db08 +size 400257 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 66d6cf21286..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:649f0385e6283ea9ed3d38dcd01a38309ff7d96e0b682569157cde82ea831785 -size 401051 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..5ccf97d204d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afbf35155b1d3dee42c6b2a2be89e7eedfa51f2d34c3ddbd43b50e30881fd7a5 +size 417479 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 6e6b38216fb..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:10d59be965510258d34601bb89d2b841e1fb87e583681dd105e89c7fd2e70461 -size 418271 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..12cb51e2aa3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea7556900e38ab67b1c468a9a8a77a477c10682df4d5ff05c65f516af089b51f +size 320955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..df63a6949cd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:cf20f00ced999b623d173c8604cdab128a3a0a2144bf03d30d70221f44b9b95a +size 325111 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 31485e566d3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c5e8cfc58a7325a33a82c6eeee5a091503ec112bfd0d3707a1d4393259bb21ef -size 321747 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 0bbaf505ff1..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:37cbe4331afc350757181df86269190a1bd6630506a055bcf4c0a64a02c93feb -size 325903 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..02b8746910c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9d85c137dd80f2c42eea8a0970c23bf94abb57f19ed1eebd28cc5f71b3c4f83 +size 342763 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..f9efcb42055 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33f957aa8376615ae9726e303a45831940b96ee5bffa23932476b9ba9d3c451e +size 347759 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index e2dba12a2f7..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8ececb610b5c12e738d19afd52058a19dc98eafb20c0bc9f32d600b8ec7ffe80 -size 343557 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 9c41e26e4ee..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:369c9ed30970016d80c20ad831e768a0761456af2e40680c164cad5a63c8803c -size 347763 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..991e2196d74 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9e567ccafb2a5861f04fb67382b06b9459664dc62c7433a505c61593744e2da +size 831558 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index 17cabecc582..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2e291cd6a7241705592a3d7385b646818d26edfd8831edd16ba3902edef8e803 -size 830722 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..6c7139cdc66 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6871ea837aa9789bfd04caa07f3ac537454398963e3eaab43851228ace362c71 +size 854254 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index 28232b254ea..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:211a605616adb431a3ecadf4a060f1eda33e9f0c423bd5a0d9c0529a2d65a242 -size 850262 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..a6ec9d28fcb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12d2d112d7309242d76e5da49ca2061b94f4e56143a7bb92566c8de0b5eb931d +size 834222 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index a35a93f4c19..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4904a2711ab04dd79cc73de2e8d1de55445ae0bd8987c43362f44974473fcb79 -size 834176 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..a492aef7ebc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0ca4bcc23e7eee7a88d43a2259495542bd4b6c7cbe42962103f598c6c10e0c8 +size 858694 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index 6b2e8f5d863..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1be6eba16b7fc04c17fcfc9c800f7b357f35d058afcae78e33ef952a1c1c53af -size 858846 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2fde935e119 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:bddf944a455f3a5c7188a517c49f501d77ef702730d30568ef914a734be39105 +size 617125 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..8dba097c0da --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:860d22d193b7ee80cf280cb409764405dda082d4e6f082a99c3bc573465f93ec +size 488107 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7a8839c2d11..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6ecd4c0f3aefe92debbb346298c9d5b0f31dfc8a7a4328f8ff0420cca7f49c8c -size 617919 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index b1e00a41866..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6089702b5307d5a8f80503dea35a5cf77c386c7abbc137cd3f2dc0ccada52f98 -size 488901 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..8c8e0459c98 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:763cca7a183e637a4acc5ecc4de86ed556e1d97bff5679d864e43c4af9a3156d +size 661428 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ac67aec1aff --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0aab18def713270dfd87e1fc16155d5848c71152319e9c463f1a8f0c393a1c3a +size 543509 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index f8778483b05..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c8a4eef21d1f5f505a9c7b90231e6c0b6aac13fa36679ba2000636e64bd1f086 -size 659114 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 1646455a379..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:056d742caece8597db1ed575686c42eb69b01d20de7aa3b352332fa11ecce150 -size 541539 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7b164c6d62c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:872a444e6de61e1edf9fe70a385bf44c1f99445a4fd7f71a624f9c759fa57814 +size 640614 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f971bf6e079 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca7b11ea1d9892db7a5941f9801e4a3cd2c4de469fc13bfa086449ddcaaba621 +size 511989 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 06bc162f06b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b91bfb5eb509f688160a8a31a021e95402674593983dd7cce9075cba60365868 -size 641406 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 2c98a225efd..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d9f241eb393193779d4d29645d0d76fc7c3f9980b4dfd2a5f440e4f88d097ff4 -size 512781 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..8529643ab3d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c0b44ac1d879db002cf0235d71d1bd64931e2cc8fd96d4c37120cbf96cb74fc +size 682942 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..91b93e4fafa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70c84a354ed22d0cf11f323c3c1d396ecef0d72e2d3296da71666995da625194 +size 567439 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4870829d9ac..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:8a76156cbf09758c97d47001e050935639763c4114f50471be303dabf2d3d017 -size 681318 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index ad8f18dbe0b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ac03cf5559dfd1db99badec8ee50e3aa86d9fc2fdf6642eb3d0171afc691ef82 -size 565469 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..bc062002c4c --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56783775148cf30b8a048014d95e91953126f804a1ca83e0a3d068c81de27ef5 +size 1109756 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index 677d17f172a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:89598f846c12882410f0453f57f7a0f2a14213b297046674364d6db252d533c8 -size 1109514 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..4ba5a9ede0f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0db342b69eaecd8d7ea7175105c3d10d458c884d8db044a524fa0e03ee26263 +size 1135754 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index e9314aa1e37..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a731ec3c224108e3e137b7b42624c5e0214a1faca314b36a8adc4ee14fe463aa -size 1139162 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..92970110c27 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6638acb7b8d68dd72aa434152a316ec7364e48d0fa22372906b602d91cc005df +size 655854 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..48c62236297 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a422c18bb2432853389426d9ccd41e8187580ae49c22ec393717ebebb251d343 +size 526685 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 2d77561debd..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8294f0a3f32657c4b6ca95a41fdd3a3492c7eb52e37868fe9ca01c2ef85b0602 -size 656646 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index fdba2629d84..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
+++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b5e28f946891a59746511c7621935b1dfa43e1adb1c59e84df601560a353fac7 -size 527479 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b2bd90bb3f4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b47bfb33604cb45ce46e1e2ab429d54630a495f9227497e26241e52983a130fa +size 692754 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b0f32f5a4d0 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97f516bb3b95350448023ff936ebce3254f12de61a9039e00a2bcee56564d60d +size 582285 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8dfbef9eba2..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5b6b1b5d6bdc072175f06254185ba1cb0eab03f56c9b892348607c9be996b5bf -size 690834 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index b3a4e74b65b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:809f0571c44a92819b81431764a13177393d5461ff6708cf6d450c17a25d754f -size 581055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..504e51281ed --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:783fb436aa76871baa0a277f1b035caf3290cb34012962b643e072973c39053c +size 679242 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0474121e42f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b45b3b5b53c8c5e84846bc3ce88a2c3e2af0e6e7be2e906b25d2cfc4097e0f7 +size 551357 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3c0c9a8b08f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:06f9f6a6d09217f1b1fd6268d13c718bfa14239067b78239ec5aef735e5819f3 -size 680034 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index b2f339dc64f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5837bfa375ab5e529cd1630769792f6d335b8fd7ad690de830aff6149b61e897 -size 552149 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..cbe9a516f67 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ad3e72d06e7554cdb57a121d5d922082702de3743382b7acc1c6d05335445d5 +size 715994 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..74d355f006f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4213c2de32468dc9bc2f17a6cf64e2db0b35ecb146a9aa98ddac1a39d5cb0d74 +size 606165 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5d22b4647a8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8bca5f70c58ab1adfc60c97a6d337362ffc31a9e0120da042b91c30a7a7091ba -size 716690 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 56313d9d2b2..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:43bf6a8a96b2640d2cefcda6d8ee573129e5a72e6c4b7180b953174c76c7af45 -size 604937 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..747f0363192 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ec4d620edd090e5c07c104f286aaf49d1e0a4d493dbbbe819f3df4466aca4e3 +size 750696 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp deleted file mode 100644 index 935efa96631..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp +++ /dev/null @@ 
-1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9bd4e30634eedce2886237ca75a6d89cf4377390910293808d37bf87f64858da -size 753710 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e23f563ae56 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c21209ee766957676d0dc53b13d539eeba7a1e737b0505f867527a0665ddb17 +size 775120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp deleted file mode 100644 index ab070b7c301..00000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1b644888468d61bbb6666e036c5a3f24a7d536cf82f3d93d133202992571f9db -size 772806 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..cb123cd8788 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6700da878680109e7f12ffd877d49e37b08ec98499059e8e6a6515e0a9ec97dd +size 792530 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp deleted file mode 100644 index 00f56f5896f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:aa7b778235a2cf818fdbc38be6f76be437dae3643bdd44ce32fa53e2134124bb -size 793028 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1a9cb217e0a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca23a4432d274c9713e1178bb9bed8d1f13456a724bf382fb08247a834afe486 +size 815474 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp deleted file mode 100644 index 1fb84fd3a59..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:291a9135a62ae4607f42b402021e27cad2fd600ba6de7f0f11c3ae8ce2b1ac17 -size 814542 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9824b0b3f28 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:5a675ad725e77c30c4e7a323a636e941dca52c2db8559527096942e1e6df8f48 +size 594327 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..fca631a89f9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5f11eb6c0b65044c1f81f4096b8c7dd8631520cc1ffae133db9ab9921894b89 +size 481293 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..61123e208ac --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7465cd5b944c1a554c6e8634c27e67c66d2d5d98f6171d7487f2257f24c600f7 +size 693308 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5327401625c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15fac533d241fd8d173cad175a8d4d675a36eecec33a89798d8cdff1b907acb9 +size 539819 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 016d779d4de..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:66f927d410f569a25d513eaacd6f70b0a5bee3a53d5e2cbacabf167aac7b1ccc -size 594331 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 6b499e10c64..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dd591d1e19e7624c14ade126069eba3e44cab22a6cae492fe21595c49ef9ccb2 -size 482087 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3dbe7e3c5e6..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d5b95fae5000e87d8bf7e91e17756b3b713b47ec7d69ac8a2ca8ed82861d6af7 -size 694100 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 01f450841cf..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:d5e52575b8b1127488fedfcc6647e8ea916ed5146ea83d13709576b98913296c -size 539823 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..27eb2803b62 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96a852256e8857d8a9e2d644269d97d6e0b98d7695c024dd838cab1052053a97 +size 620676 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..45c42874540 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:442456ee1da37a612319bb47f6c57769794a8472df6873a8ea61d9c8ed941be5 +size 505767 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0a85950c002 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96278d994f3784eb8d7a1206e4a04aeca104fdd00a4d7f27c0c63ef482967e9b +size 716400 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..3603b6ab4c0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19d373291d7f87412b2c539d3bc77d47da0323436ce5fcc18cbc29c2f1334ff5 +size 564587 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index d73e263f70e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:22c25c6d9424cb4ec7375b7d8bfe8b02e9bc4264b884a3099f3e529047557687 -size 620680 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 79d23132077..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d8bf59455ec05ecb70753f6a7660f99a0128874173a51e9599f3cdaa858056e3 -size 506559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 02ef6c3769f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:8cd529dd26745718b4e52ae91c680c03cde6893160f6ff6e7fc29d2dd88d4ca9 -size 716404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5acf00ad3eb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b2b496a1d669776890ddebd78cfb97eda189fd2db4879534d28c902476097ddb -size 564591 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..12c11d9902c --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99fe2e8a8e8ccb75e75afb2ca51249c6150254dfa52395e4d0c8ec6dbfd04904 +size 593095 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..6793d69e5be --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9426de36443e751ed6d0f995c455b547dbdc9064df1b9efa3882e0ca6452ccd7 +size 475127 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 
100644 index 00000000000..39f6d58519a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fb5879e4f17b68e6a0f5a87f95aa638de2e75de15dfc915c2f042eafbd9c7da +size 684328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..56a5ad062d8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f0551c4f64c40da735a443007bc087a62e389e3f3e1a253942625f7525f788d +size 534195 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8b7b4094038..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f0f7ad67622d852200644b06c9c7831de89d12504ba1e0a7bcd710473296fde7 -size 593887 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 818c6704d6a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3a7a0518c231c51981c51f20aaf5e045f1669f4ddf7da50cfc857ceeb874c4a3 -size 475919 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index e83de87bfdb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:59df568e46029979013b1fc17349135db449f46057c3eca4263dfde15ae599ba -size 685122 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index a6cb177e2f3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:f1463584c253e0f4791975bb0275c188e2fc6b0fb671ce5ac05b006550d7e6d9 -size 534987 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9b13ebf0f9d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1b8d17dbeefb8ef0fdcc6270e152b5ebb069afdd36c78a46e20af3a76ec1d95 +size 616483 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0c70fd874bc --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a6cc1cb1e00e86c52b5d5bc4790567ec847d794f7790e2a68cb997c0eef20bc +size 505125 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9850946f672 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae652545e98b56be68e1268f80c4ec1a0f887a41b5e5bb400d87e2e8ea856256 +size 712304 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..22fd4df6c7c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c9f73acdf5dc39206fa9ae1b92e4c4f4e45bf9b19c6b761edb860af9d248b9c +size 565131 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index ed538dd1f6d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ca797a09f7cba755766c62dee60ee9d1b2849b8a08cbba1efd60b3331e790f31 -size 617275 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index ac603e42e9a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:51189ce4fe72ab9a4a9bcb90a798e1e9e9c8cc7388b3e2593347bcf909db01a5 -size 505919 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index d73f19c11e5..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:d03954e0e9d6d51e16cdeeea4632d20a57dfb6579614af0d71e8f22f40c87eaa -size 713098 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 1c1c6024647..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1be8d4dcae4b6cd618993a8261c9fc11456146b40187b0fd4db48f7412d310f9 -size 565923 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/CudaArchDecl.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/CudaArchDecl.h index dba18f1c759..c0070eead40 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/CudaArchDecl.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/CudaArchDecl.h @@ -16,6 +16,7 @@ */ #pragma once +#include #include #include diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/CudaKernelLauncher.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/CudaKernelLauncher.h index 26e9d2d5122..b74d13476d2 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/CudaKernelLauncher.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/CudaKernelLauncher.h @@ -33,10 +33,70 @@ namespace gen //////////////////////////////////////////////////////////////////////////////////////////////////// #ifdef TLLM_ENABLE_CUDA +inline CUresult launchKernelFlexibleCgaSizes(void* kernelParams, void* cudaStream, int32_t smemSize, CUfunction kernel, + dim3 block3, dim3 grid3, dim3 cluster3, dim3 fallbackCluster3, bool enablesPdl) +{ + // Make sure we can launch with that much shared memory. + // Note: those function-level settings are actually ignored as we use per-launch attributes. + if (smemSize > 48 * 1024) + { + CUresult result; + result = cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smemSize); + if (result != CUDA_SUCCESS) + { + return result; + } + } + + auto clusterDim = cluster3.x * cluster3.y * cluster3.z; + + CUlaunchConfig launchConfig; + launchConfig.blockDimX = block3.x; + launchConfig.blockDimY = block3.y; + launchConfig.blockDimZ = block3.z; + launchConfig.gridDimX = grid3.x; + launchConfig.gridDimY = grid3.y; + launchConfig.gridDimZ = grid3.z; + launchConfig.hStream = reinterpret_cast(cudaStream); + launchConfig.sharedMemBytes = smemSize; + + CUlaunchAttribute launchAttrs[4]; + launchAttrs[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION; + launchAttrs[0].value.clusterDim.x = fallbackCluster3.x; + launchAttrs[0].value.clusterDim.y = fallbackCluster3.y; + launchAttrs[0].value.clusterDim.z = fallbackCluster3.z; + launchAttrs[1].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE; + launchAttrs[1].value.clusterSchedulingPolicyPreference + = (clusterDim > 1) ? 
CU_CLUSTER_SCHEDULING_POLICY_SPREAD : CU_CLUSTER_SCHEDULING_POLICY_DEFAULT; + launchAttrs[2].id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION; + launchAttrs[2].value.programmaticStreamSerializationAllowed = enablesPdl; + launchAttrs[3].id = CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION; + launchAttrs[3].value.preferredClusterDim.x = cluster3.x; + launchAttrs[3].value.preferredClusterDim.y = cluster3.y; + launchAttrs[3].value.preferredClusterDim.z = cluster3.z; + launchConfig.attrs = launchAttrs; + launchConfig.numAttrs = 4; + + // Add setting for non-portable cluster size. + { + CUresult result = cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, + 1 // Enable non-portable cluster sizes + ); + if (result != CUDA_SUCCESS) + { + return result; + } + } + + // Launch the kernel. + return cuLaunchKernelEx(&launchConfig, kernel, &kernelParams, nullptr); +} + inline CUresult launchKernel(void* kernelParams, void* cudaStream, int32_t smemSize, CUfunction kernel, dim3 block3, dim3 grid3, dim3 cluster3, bool enablesPdl) { // Make sure we can launch with that much shared memory. + // Note: those function-level settings are actually ignored as we use per-launch attributes. if (smemSize > 48 * 1024) { CUresult result; @@ -69,8 +129,8 @@ inline CUresult launchKernel(void* kernelParams, void* cudaStream, int32_t smemS = (clusterDim > 1) ? CU_CLUSTER_SCHEDULING_POLICY_SPREAD : CU_CLUSTER_SCHEDULING_POLICY_DEFAULT; launchAttrs[2].id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION; launchAttrs[2].value.programmaticStreamSerializationAllowed = enablesPdl; - launchConfig.attrs = launchAttrs; launchConfig.numAttrs = 3; + launchConfig.attrs = launchAttrs; // Add setting for non-portable cluster size. 
if (clusterDim > 8) diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/MmaDecl.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/MmaDecl.h index 7b136dad2e7..5677e1496ef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/MmaDecl.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/MmaDecl.h @@ -93,10 +93,11 @@ inline std::string mmaKindToString(MmaKind mmaKind) //////////////////////////////////////////////////////////////////////////////////////////////////// -// Get the TMEM column stride per group (i.e. kGroupSize * blockSize K elements) -inline int32_t getTmemColStridePerGroup(int32_t tileMn, int32_t mmaK, int32_t kGroupSize) +// Get the TMEM column stride per group. +// A group is one or more MMA instructions that share the same TMEM columns. +inline int32_t getTmemColStridePerGroup(int32_t mmaMn, int32_t mmaK, [[maybe_unused]] int32_t kGroupSize) { - int32_t colStride = 2 * ceilDiv(tileMn, 64); + int32_t colStride = 2 * ceilDiv(mmaMn, 64); if (mmaK == 96) { colStride = std::max(4, colStride); @@ -106,6 +107,8 @@ inline int32_t getTmemColStridePerGroup(int32_t tileMn, int32_t mmaK, int32_t kG //////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace gen } // namespace trtllm diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/SfLayoutDecl.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/SfLayoutDecl.h index 72d0e1a259a..98591b0b502 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/SfLayoutDecl.h +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/SfLayoutDecl.h @@ -70,6 +70,7 @@ enum class SfLayout // I.e., the SF buffer is a tensor [⌈m/128⌉, ⌈n/b/4⌉, 32, 4, 4] // The SF for the element (i, j) is stored at (i/128, j/b/4, i%32, (i%128)/32, (j/b)%4). R128c4, + }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -88,6 +89,13 @@ inline std::string sfLayoutToString(SfLayout layout) //////////////////////////////////////////////////////////////////////////////////////////////////// +inline bool sfLayoutCanUseUtccp(SfLayout layout) +{ + return (layout == SfLayout::R128c4); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace gen } // namespace trtllm diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.cu b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.cu index d750cd8f41e..fcc12ceab7e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.cu +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.cu @@ -240,11 +240,18 @@ tensorrt_llm::kernels::TrtllmGenBatchedGemmRunnerOptions getOptions( } else { + EltwiseActType eltwiseActType = EltwiseActType::None; + switch (actType) + { + default: + case ActType::Relu2: eltwiseActType = EltwiseActType::Relu2; break; + case ActType::Silu: eltwiseActType = EltwiseActType::Silu; break; + } options = { .dtypeA = dtypeWeights, .dtypeB = dtypeAct, .dtypeC = dtypeAct, - .eltwiseActType = EltwiseActType::Relu2, + .eltwiseActType = eltwiseActType, .deepSeekFp8 = useDeepSeekFp8, .fusedAct = false, .routeAct = true, diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy index 9c98f0215cf..016746b6a1f 100644 --- a/jenkins/L0_MergeRequest.groovy +++ b/jenkins/L0_MergeRequest.groovy @@ -241,11 +241,11 @@ def createKubernetesPodConfig(image, type, arch = "amd64") resources: requests: 
cpu: '2' - memory: 10Gi + memory: 20Gi ephemeral-storage: 25Gi limits: cpu: '2' - memory: 10Gi + memory: 20Gi ephemeral-storage: 25Gi imagePullPolicy: Always""" nodeLabelPrefix = "cpu" diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/trtllm_moe.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/trtllm_moe.py index 48c22c76863..8ac2c90d83b 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/trtllm_moe.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/trtllm_moe.py @@ -306,11 +306,11 @@ def trtllm_moe_fused( ) else: # For non-gated MLP with ReLU^2 - if act_fn == ActivationType.Relu2: - activation_type = ActivationType.Relu2 + if act_fn in [ActivationType.Relu2, ActivationType.Silu]: + activation_type = act_fn else: raise ValueError( - f"Unsupported activation '{ActivationType(act_fn).name}' for mlp. Use 'relu2'." + f"Unsupported activation '{ActivationType(act_fn).name}' for mlp. Use 'relu2' or 'silu'." ) mapping, enable_alltoall = _check_moe_alltoall(mapping_config, max_num_tokens) @@ -363,10 +363,10 @@ def trtllm_moe_fused_fake( def _validate_mlp_style_and_act_fn(is_gated_mlp: bool, act_fn: int) -> None: assert (is_gated_mlp and act_fn in [ActivationType.Silu, ActivationType.Swiglu]) or ( - not is_gated_mlp and act_fn == ActivationType.Relu2 + not is_gated_mlp and act_fn in [ActivationType.Relu2, ActivationType.Silu] ), ( f"Unsupported combination: is_gated_mlp='{is_gated_mlp}', act_fn='{act_fn}'. " - f"Supported combinations: gated mlp with silu or mlp with relu2." + f"Supported combinations: gated mlp with silu or mlp with relu2 or silu." 
) @@ -410,7 +410,7 @@ def trtllm_quant_fp8_moe_fused( fc2_act_scale_reciprocal: FC2 activation scale reciprocal (scalar) fc2_dequant_scale: FC2 dequant scale [E] is_gated_mlp: True for gated_mlp, False for mlp - act_fn: ActivationType.Silu for gated_mlp, ActivationType.Relu2 for mlp + act_fn: ActivationType.Silu for gated_mlp, ActivationType.Relu2 or ActivationType.Silu for mlp Returns: Output tensor of shape (B, H) or (B, S, H) @@ -551,7 +551,7 @@ def trtllm_quant_nvfp4_moe_fused( fc1_alpha: FC1 dequant scales = 1.0 / (fc1_act_global_scale * fc1_weight_global_scale) fc2_alpha: FC2 dequant scales = 1.0 / (fc2_act_global_scale * fc2_weight_global_scale) mlp_style: "gated_mlp" or "mlp" - act_fn: "silu" for gated_mlp, "relu2" for mlp + act_fn: "silu" for gated_mlp, "relu2" or "silu" for mlp """ # Validate block scale tensors are 3D (padding requirements handled below) diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py index 956542fd6ff..149e7fecf85 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py +++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py @@ -283,6 +283,8 @@ def _to_trtllm_gen_activation_type(self, return 0 elif activation_type == ActivationType.Relu2: return 1 + elif activation_type == ActivationType.Silu: + return 2 else: raise ValueError(f"Unsupported activation type: {activation_type}") @@ -340,8 +342,9 @@ def _get_quant_method(self): return DeepSeekFP8BlockScalesFusedMoEMethod() elif self.quant_config.layer_quant_mode.has_nvfp4(): return NVFP4TRTLLMGenFusedMoEMethod( - ) if self.swiglu_alpha is not None or self.activation_type == ActivationType.Relu2 else NVFP4TRTLLMGenFusedMoEBaseMethod( - ) + ) if self.swiglu_alpha is not None or self.activation_type in [ + ActivationType.Relu2, ActivationType.Silu + ] else NVFP4TRTLLMGenFusedMoEBaseMethod() elif self.quant_config.layer_quant_mode.has_w4a16_mxfp4(): return 
W4A16MXFP4TRTLLMGenFusedMoEMethod() elif self.quant_config.layer_quant_mode.has_w4a8_nvfp4_fp8(): @@ -573,7 +576,9 @@ def run_moe( # When output is provided, use it directly as the result final_hidden_states = moe_output if moe_output is not None else result elif self.has_nvfp4: - factor = 1 if self.activation_type == ActivationType.Relu2 else 2 + factor = 1 if self.activation_type in [ + ActivationType.Relu2, ActivationType.Silu + ] else 2 intermediate_size_per_partition_padded = self.w3_w1_weight.shape[ -2] // factor act_type = self._to_trtllm_gen_activation_type(self.activation_type) diff --git a/tensorrt_llm/_torch/modules/fused_moe/quantization.py b/tensorrt_llm/_torch/modules/fused_moe/quantization.py index 2a349c28e03..bb374943bfc 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/quantization.py +++ b/tensorrt_llm/_torch/modules/fused_moe/quantization.py @@ -2825,10 +2825,11 @@ def load_quant_scales(self, module: torch.nn.Module, weights: Dict): # last step: load fc31_scale_c # c_global_sf: fc2_input_scale # For gated activations (SwiGlu), scale_c_fc1 includes both input and weight scales - # For non-gated activations (Relu2), scale_c_fc1 is just the input scale + # For non-gated activations (Relu2 or Silu), scale_c_fc1 is just the input scale from ...utils import ActivationType - if hasattr(module, 'activation_type' - ) and module.activation_type == ActivationType.Relu2: + if hasattr(module, 'activation_type') and module.activation_type in [ + ActivationType.Relu2, ActivationType.Silu + ]: # For Relu2: scale_c_fc1 = fc2_input_scale (broadcast to all experts) module.fc31_scale_c.data.copy_(module.fc2_input_scale.data.expand( module.expert_size_per_partition), diff --git a/tensorrt_llm/_torch/utils.py b/tensorrt_llm/_torch/utils.py index 3c243346bb8..64387894d0b 100644 --- a/tensorrt_llm/_torch/utils.py +++ b/tensorrt_llm/_torch/utils.py @@ -54,6 +54,7 @@ class ActivationType(IntEnum): class ActType_TrtllmGen(IntEnum): SwiGlu = 0 Relu2 = 1 + Silu = 2 # 
IMPORTANT: when adding a new activation type, please update this function. diff --git a/tests/unittest/_torch/thop/serial/test_moe.py b/tests/unittest/_torch/thop/serial/test_moe.py index a1912def29b..53c70ee21c0 100644 --- a/tests/unittest/_torch/thop/serial/test_moe.py +++ b/tests/unittest/_torch/thop/serial/test_moe.py @@ -42,6 +42,7 @@ class ActType(Enum): SwiGlu = 0 Relu2 = 1 + Silu = 2 class moe_args: @@ -427,6 +428,8 @@ def run_moe_dequant(args, activation_output[i:i + my_num_tokens] = act * (beta + my_x1) elif args.act_type == ActType.Relu2: activation_output[i:i + my_num_tokens] = F.relu(my_x1)**2 + elif args.act_type == ActType.Silu: + activation_output[i:i + my_num_tokens] = F.silu(my_x1) i += my_num_tokens i = (i + args.padding - 1) // args.padding * args.padding @@ -1034,8 +1037,9 @@ class TestMoeFp4: @pytest.mark.parametrize("num_tokens", [1, 1024]) @pytest.mark.parametrize("hidden_size", [1024]) @pytest.mark.parametrize("intermediate_size", [1024, 768]) - @pytest.mark.parametrize("act_type", [ActType.SwiGlu, ActType.Relu2], - ids=["swiglu", "relu2"]) + @pytest.mark.parametrize("act_type", + [ActType.SwiGlu, ActType.Relu2, ActType.Silu], + ids=["swiglu", "relu2", "silu"]) @pytest.mark.parametrize( "routing_info", [ @@ -1161,8 +1165,9 @@ def test_autotune_fp8_fp4(self, num_tokens, hidden_size, intermediate_size, @pytest.mark.parametrize("num_tokens", [1, 150]) @pytest.mark.parametrize("hidden_size", [1024]) @pytest.mark.parametrize("intermediate_size", [1024]) - @pytest.mark.parametrize("act_type", [ActType.SwiGlu, ActType.Relu2], - ids=["swiglu", "relu2"]) + @pytest.mark.parametrize("act_type", + [ActType.SwiGlu, ActType.Relu2, ActType.Silu], + ids=["swiglu", "relu2", "silu"]) @pytest.mark.parametrize( "routing_info", [ @@ -1636,7 +1641,7 @@ def run_moe_fp4_test(self, scale_c_fc1 = args_dequant.c_global_sf * ( 1.0 / args.gemm1_scales_global) * ( 1.0 / args.hidden_states_scale_global) - elif act_type == ActType.Relu2: + elif act_type in [ActType.Relu2, 
ActType.Silu]: scale_c_fc1 = torch.full_like(args.gemm1_scales_global, args_dequant.c_global_sf) # self.fc31_alpha @@ -1686,7 +1691,7 @@ def run_moe_fp4_test(self, do_finalize=True, topk_ids=topk_ids, topk_weights=topk_weights, - act_type=1 if act_type == ActType.Relu2 else 0) + act_type=act_type.value) torch.cuda.synchronize() output_dequant_actual = output[0].to(torch.float) @@ -1697,7 +1702,7 @@ def run_moe_fp4_test(self, else: atol = 0.1 rtol = 0.85 - percent = 0.925 + percent = 0.9 check_accuracy(output_dequant_reference, output_dequant_actual, From d3a16b329840d06e4de46c6c64423144e390dc44 Mon Sep 17 00:00:00 2001 From: Guiju Zhang <7135567+cascade812@users.noreply.github.com> Date: Mon, 9 Mar 2026 10:13:31 -0700 Subject: [PATCH 4/9] [TRTLLM-11045][feat] Integrate SA with EAGLE3 and PARD (#11878) Signed-off-by: Guiju Zhang <7135567+cascade812@users.noreply.github.com> --- docs/source/features/speculative-decoding.md | 71 +++++++++++ .../_torch/pyexecutor/model_engine.py | 34 +++-- tensorrt_llm/_torch/speculative/__init__.py | 2 + tensorrt_llm/_torch/speculative/eagle3.py | 60 +++++++-- tensorrt_llm/_torch/speculative/mtp.py | 86 ++++++------- tensorrt_llm/_torch/speculative/pard.py | 45 +++++++ .../_torch/speculative/sa_enhancer.py | 120 ++++++++++++++++++ tensorrt_llm/_torch/speculative/sa_worker.py | 1 - .../_torch/speculative/suffix_automaton.py | 83 +++++++++--- tensorrt_llm/_torch/speculative/utils.py | 29 ++++- tensorrt_llm/llmapi/llm_args.py | 24 +++- .../defs/accuracy/references/gsm8k.yaml | 6 + .../defs/accuracy/test_llm_api_pytorch.py | 53 +++++++- .../test_lists/qa/llm_function_core.txt | 2 + 14 files changed, 518 insertions(+), 98 deletions(-) create mode 100644 tensorrt_llm/_torch/speculative/sa_enhancer.py diff --git a/docs/source/features/speculative-decoding.md b/docs/source/features/speculative-decoding.md index cc55736a171..d1673deaffe 100644 --- a/docs/source/features/speculative-decoding.md +++ b/docs/source/features/speculative-decoding.md 
@@ -48,6 +48,8 @@ speculative_config = Eagle3DecodingConfig( llm = LLM(model, speculative_config=speculative_config) ``` +EAGLE 3 can be combined with the [Suffix Automaton enhancement](#suffix-automaton-sa-enhancement) for improved acceptance rates on repetitive content. See the SA section below for details. + ### NGram The NGram method is an implementation of [this Prompt Lookup Decoding algorithm](https://github.com/apoorvumang/prompt-lookup-decoding). @@ -88,6 +90,29 @@ speculative_config = MTPDecodingConfig( llm = LLM("/path/to/deepseek_model", speculative_config=speculative_config) ``` +MTP can be combined with the [Suffix Automaton enhancement](#suffix-automaton-sa-enhancement) for improved acceptance rates on repetitive content. See the SA section below for details. + +### PARD + +PARD (PARallel Draft) is a target-independent speculative decoding method that predicts all draft tokens in a single forward pass using mask tokens. Unlike MTP or EAGLE 3 which generate drafts one token at a time, PARD produces K draft tokens in parallel. + +Reference: [PARD: Parallel Drafting for Speculative Decoding](https://arxiv.org/pdf/2504.18583) + +* `max_draft_len`: Maximum draft candidate length. +* `speculative_model`: Path or HuggingFace model ID for the PARD draft model. +* `mask_token_id`: Token ID used as the mask token for parallel prediction. If not set, it is read from the draft model config. + +```python +from tensorrt_llm.llmapi import PARDDecodingConfig + +speculative_config = PARDDecodingConfig( + max_draft_len=4, speculative_model="/path/to/pard_model") + +llm = LLM("/path/to/target_model", speculative_config=speculative_config) +``` + +PARD can be combined with the [Suffix Automaton enhancement](#suffix-automaton-sa-enhancement) for improved acceptance rates on repetitive content. See the SA section below for details. 
+ ### User-provided drafting A completely user-defined drafting method can be supplied with a `UserProvidedDecodingConfig` that includes * `max_draft_len`: Maximum draft candidate length. @@ -103,6 +128,40 @@ speculative_config = UserProvidedDecodingConfig( llm = LLM("/path/to/target_model", speculative_config=speculative_config) ``` +## Suffix Automaton (SA) Enhancement + +The Suffix Automaton (SA) is a model-free, GPU-based pattern-matching draft enhancer. It finds suffix matches in previously generated tokens and proposes draft tokens when the match is long enough. SA is very accurate when it matches (exact pattern repetition), while neural methods are better for novel content — combining them gives the best of both worlds. + +SA can be combined with the following speculative decoding techniques: + +* **MTP** (`MTPDecodingConfig`) +* **EAGLE 3** (`Eagle3DecodingConfig`) +* **PARD** (`PARDDecodingConfig`) + +To enable SA combination, set `use_sa_spec=True` on the speculative config. The `sa_spec_threshold` parameter controls the minimum suffix match length required to override the neural draft (default: 4). + +```python +from tensorrt_llm.llmapi import Eagle3DecodingConfig + +speculative_config = Eagle3DecodingConfig( + max_draft_len=4, + speculative_model="/path/to/eagle3_model", + use_sa_spec=True, + sa_spec_threshold=4) + +llm = LLM("/path/to/target_model", speculative_config=speculative_config) +``` + +SA can also be used as a standalone speculative decoding technique via `SADecodingConfig`: + +```python +from tensorrt_llm.llmapi import SADecodingConfig + +speculative_config = SADecodingConfig(max_draft_len=4) + +llm = LLM("/path/to/target_model", speculative_config=speculative_config) +``` + ## Usage with `trtllm-bench` and `trtllm-serve` ```{eval-rst} @@ -117,6 +176,8 @@ Speculative decoding options must be specified via `--config config.yaml` for bo * `Eagle3` * `NGram` * `DraftTarget` +* `PARD` +* `SA` > Note: The PyTorch backend supports only `Eagle3`. 
`decoding_type: Eagle` is accepted as a backward-compatible alias for `Eagle3`, but EAGLE (v1/v2) draft checkpoints are incompatible. @@ -138,6 +199,16 @@ speculative_config: speculative_model: /path/to/draft/model ``` +```yaml +# SA combination: enable Suffix Automaton enhancement with any supported technique +speculative_config: + decoding_type: Eagle3 + max_draft_len: 4 + speculative_model: /path/to/draft/model + use_sa_spec: true + sa_spec_threshold: 4 +``` + ```{note} The field name `speculative_model_dir` can also be used as an alias for `speculative_config.speculative_model`. For example: diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index 1282e6c92cd..c2cfe9c68f3 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -3,7 +3,6 @@ import functools import gc import inspect -import itertools import math import os import weakref @@ -3489,21 +3488,28 @@ def _prepare_inputs( raise NotImplementedError( f"Unsupported cp_type {getattr(cp_type, 'name', cp_type)}.") - # Initialize SA state for new requests (MTP+SA path) + # Initialize SA state for new requests (MTP+SA, EAGLE3+SA, PARD+SA, etc.) 
use_sa_spec = (self.spec_config is not None and getattr(self.spec_config, 'use_sa_spec', False)) - if (use_sa_spec and spec_metadata is not None - and hasattr(spec_metadata, 'sa_manager') - and spec_metadata.sa_manager is not None - and self.mapping.is_last_pp_rank()): - sa_manager = spec_metadata.sa_manager - for request in itertools.chain( - scheduled_requests.context_requests, - scheduled_requests.generation_requests): - if request.py_request_id not in sa_manager._initialized_requests: - sa_manager.add_request(request.py_request_id, - request.get_tokens(0)) - sa_manager._initialized_requests.add(request.py_request_id) + if use_sa_spec and resource_manager is not None and self.mapping.is_last_pp_rank( + ): + from tensorrt_llm._torch.speculative.suffix_automaton import \ + SuffixAutomatonManager + spec_rm = resource_manager.get_resource_manager( + ResourceManagerType.SPEC_RESOURCE_MANAGER) + sa_manager = None + if spec_rm is not None: + if isinstance(spec_rm, SuffixAutomatonManager): + sa_manager = spec_rm + else: + sa_manager = getattr(spec_rm, 'sa_manager', None) + if sa_manager is not None: + for request in scheduled_requests.all_requests(): + if request.py_request_id not in sa_manager._initialized_requests: + sa_manager.add_request(request.py_request_id, + request.get_tokens(0)) + sa_manager._initialized_requests.add( + request.py_request_id) return self._prepare_tp_inputs( scheduled_requests, kv_cache_manager, attn_metadata, spec_metadata, diff --git a/tensorrt_llm/_torch/speculative/__init__.py b/tensorrt_llm/_torch/speculative/__init__.py index 4771380ea3b..220e6156ea8 100644 --- a/tensorrt_llm/_torch/speculative/__init__.py +++ b/tensorrt_llm/_torch/speculative/__init__.py @@ -7,6 +7,7 @@ from .mtp import MTPEagleWorker, MTPSampler, MTPSpecMetadata, MTPWorker from .ngram import NGramDrafter, NGramPoolManager from .pard import PARDSpecMetadata, PARDWorker +from .sa_enhancer import SADraftEnhancer from .sa_worker import SASampler, SASpecMetadata, SAWorker 
from .save_hidden_state import (SaveHiddenStatesResourceManager, SaveHiddenStatesSpecMetadata) @@ -31,6 +32,7 @@ "NGramPoolManager", "PARDSpecMetadata", "PARDWorker", + "SADraftEnhancer", "SASampler", "SASpecMetadata", "SAWorker", diff --git a/tensorrt_llm/_torch/speculative/eagle3.py b/tensorrt_llm/_torch/speculative/eagle3.py index a09e6075258..71e8381067f 100644 --- a/tensorrt_llm/_torch/speculative/eagle3.py +++ b/tensorrt_llm/_torch/speculative/eagle3.py @@ -14,6 +14,7 @@ from ..pyexecutor.scheduler import ScheduledRequests from .interface import SpecMetadata, SpecWorkerBase from .mtp import MTPSampler +from .sa_enhancer import SADraftEnhancer from .spec_tree_manager import SpecTreeManager if TYPE_CHECKING: @@ -27,14 +28,21 @@ class Eagle3ResourceManager(BaseResourceManager): and one for the draft model. Use this class to manage the hidden states. """ - def __init__(self, config: "EagleDecodingConfig", dtype: torch.dtype, - hidden_size: int, max_num_requests: int, max_seq_len: int, - max_num_tokens: int): + def __init__(self, + config: "EagleDecodingConfig", + dtype: torch.dtype, + hidden_size: int, + max_num_requests: int, + max_seq_len: int, + max_num_tokens: int, + sa_manager=None): self.dtype = dtype self.max_draft_len = config.max_draft_len self.hidden_size = hidden_size self.max_num_requests = max_num_requests self.max_seq_len = max_seq_len + # Optional SA manager for EAGLE3+SA mode + self.sa_manager = sa_manager # There could be dummy request for padding batch when using CUDA graph. # Reserve one more slot for the dummy request. 
slot_size = self.max_seq_len + 1 @@ -94,13 +102,18 @@ def free_resources(self, request: LlmRequest): self.seq_lens[slot_id] = 0 self.start_indices[slot_id] = 0 self.slot_manager.remove_slot(request.request_id) + if self.sa_manager is not None: + self.sa_manager.remove_request(request.request_id) def add_dummy_requests(self, request_ids: List[int]): for rid in request_ids: self.slot_manager.add_slot(rid) + if self.sa_manager is not None: + self.sa_manager.add_dummy_requests(request_ids) def shutdown(self): - pass + if self.sa_manager is not None: + self.sa_manager.shutdown() def get_max_resource_count(self) -> int: return self.max_num_requests @@ -298,6 +311,8 @@ class Eagle3OneModelSpecMetadata(SpecMetadata): dtype: torch.dtype = torch.bfloat16 # The index of the batch inputs batch_indices_cuda: Optional[torch.Tensor] = None + # Optional resource manager (used to access SA manager for EAGLE3+SA) + spec_resource_manager: Optional[Eagle3ResourceManager] = None def __post_init__(self): if self.layers_to_capture is None: @@ -345,6 +360,12 @@ def prepare(self): non_blocking=True) self.num_tokens -= (self.num_generations) * self.max_draft_len + sa_manager = getattr(self.spec_resource_manager, 'sa_manager', None) + if sa_manager is not None: + gen_request_ids = self.request_ids[num_seqs - self.num_generations:] + if gen_request_ids: + sa_manager.prepare(gen_request_ids, self.max_draft_len) + def maybe_capture_hidden_states( self, layer_id: int, @@ -375,6 +396,9 @@ def __init__(self, super().__init__(use_separate_draft_kv_cache) self.spec_config = spec_config self.mapping = mapping + self.sa_enhancer: Optional[SADraftEnhancer] = None + if getattr(spec_config, 'use_sa_spec', False): + self.sa_enhancer = SADraftEnhancer(spec_config.sa_spec_threshold) @property def max_draft_len(self) -> int: @@ -424,6 +448,19 @@ def forward(self, accepted_tokens, num_accepted_tokens = self.sample_and_accept_draft_tokens( logits, attn_metadata, spec_metadata) + sa_manager = 
getattr(spec_metadata.spec_resource_manager, 'sa_manager', + None) + if self.sa_enhancer is not None and sa_manager is not None: + self.sa_enhancer.extend_and_prepare( + sa_manager=sa_manager, + request_ids=spec_metadata.request_ids, + accepted_tokens=accepted_tokens, + num_accepted_tokens=num_accepted_tokens, + num_gens=num_gens, + num_contexts=num_contexts, + max_draft_len=self.max_draft_len, + ) + # Save the old attn_metadata and spec_metadata self._prepare_attn_metadata_for_spec_dec(attn_metadata) @@ -528,6 +565,14 @@ def forward(self, } next_draft_tokens = torch.stack(next_draft_tokens, dim=1) + # Override with SA draft tokens after all draft layers have run, + # so that draft layers never see SA tokens in their inputs. + if self.sa_enhancer is not None: + gen_draft_tokens = next_draft_tokens[num_contexts:] + gen_draft_tokens = self.sa_enhancer.maybe_override_all_draft_tokens( + gen_draft_tokens) + next_draft_tokens[num_contexts:] = gen_draft_tokens + # restore attn_metadata to support cuda graph self._restore_attn_metadata_from_spec_dec(attn_metadata) # restore all_rank_num_tokens for attention DP @@ -588,11 +633,10 @@ def draft_decoder( Draft token ids. Flattened. ''' - # Note: using greedy for draft tokens is a bit easier to implement and - # faster. It doesn't affect the final output and seems to have a negligible - # impact on AR. 
d2t = getattr(draft_model.model, "d2t", None) - return self._draft_sampler_greedy(logits, d2t) + draft_tokens = self._draft_sampler_greedy(logits, d2t) + + return draft_tokens def prepare_1st_drafter_inputs( self, diff --git a/tensorrt_llm/_torch/speculative/mtp.py b/tensorrt_llm/_torch/speculative/mtp.py index f8f6ade06ea..67b31b9ca0f 100644 --- a/tensorrt_llm/_torch/speculative/mtp.py +++ b/tensorrt_llm/_torch/speculative/mtp.py @@ -18,8 +18,8 @@ from ..pyexecutor.sampler import TorchSampler from ..pyexecutor.scheduler import ScheduledRequests from .interface import SpecMetadata, SpecWorkerBase +from .sa_enhancer import SADraftEnhancer from .spec_sampler_base import SampleStateSpec, SpecSamplerBase -from .suffix_automaton import SuffixAutomatonManager if TYPE_CHECKING: from tensorrt_llm.llmapi.llm_args import MTPDecodingConfig @@ -128,8 +128,6 @@ class MTPSpecMetadata(SpecMetadata): # CUDA graph, we use this tensor to store the number of input tokens for the # subsequent draft forward. 
subseq_all_rank_num_tokens: Optional[List[int]] = None - # Optional suffix automaton manager for MTP+SA speculative decoding - sa_manager: Optional[SuffixAutomatonManager] = None def __post_init__(self) -> None: if self.mtp_hidden_states_manager is not None: @@ -221,12 +219,12 @@ def prepare(self): pin_memory=prefer_pinned()) self.slot_ids[:num_seqs].copy_(mtp_slot_ids, non_blocking=True) - # Prepare SA manager for MTP+SA path (copies pending states to GPU) - if self.sa_manager is not None: + sa_manager = getattr(self.mtp_hidden_states_manager, 'sa_manager', None) + if sa_manager is not None: num_contexts = num_seqs - self.num_generations gen_request_ids = self.request_ids[num_contexts:] if gen_request_ids: - self.sa_manager.prepare(gen_request_ids, self.max_draft_len) + sa_manager.prepare(gen_request_ids, self.max_draft_len) class MTPSampler(SpecSamplerBase): @@ -272,10 +270,9 @@ def __init__(self, self.spec_config = spec_config self.model_config = model_config self.is_thop = False - # Initialize SA spec attributes - self.sa_match_len = None - self.sa_draft_tokens = None - self.sa_spec_index = 0 + self.sa_enhancer: Optional[SADraftEnhancer] = None + if spec_config.use_sa_spec: + self.sa_enhancer = SADraftEnhancer(spec_config.sa_spec_threshold) @property def max_draft_len(self) -> int: @@ -468,6 +465,15 @@ def forward( } next_draft_tokens = torch.stack(next_draft_tokens, dim=1) + # Override with SA draft tokens after all MTP layers have run, + # so that MTP layers never see SA tokens in their inputs. 
+ if self.sa_enhancer is not None: + num_contexts = attn_metadata.num_contexts + gen_draft_tokens = next_draft_tokens[num_contexts:] + gen_draft_tokens = self.sa_enhancer.maybe_override_all_draft_tokens( + gen_draft_tokens) + next_draft_tokens[num_contexts:] = gen_draft_tokens + # restore attn metadata if attn_metadata is not None: self._restore_attn_metadata_from_spec_dec(attn_metadata) @@ -834,30 +840,18 @@ def sample_and_accept_draft_tokens( logits, draft_tokens, num_contexts, batch_size, spec_metadata) - if self.spec_config.use_sa_spec and spec_metadata.sa_manager is not None: - - # Initialize the output buffers - self.sa_match_len = torch.zeros((num_gens, ), - dtype=torch.int32, - device="cuda") - self.sa_draft_tokens = torch.zeros((num_gens, mtp_num_modules), - dtype=torch.int32, - device="cuda") - - self.sa_spec_index = 0 - - # Invoke a batch update of the suffix automaton states - # and get the next suffix draft tokens - if num_gens > 0: - gen_request_ids = spec_metadata.request_ids[num_contexts:] - match_len, draft_tokens_sa = spec_metadata.sa_manager.extend( - gen_request_ids, - accepted_tokens[num_contexts:], - num_accepted_tokens[num_contexts:], - mtp_num_modules, - ) - self.sa_match_len.copy_(match_len) - self.sa_draft_tokens.copy_(draft_tokens_sa) + sa_manager = getattr(spec_metadata.mtp_hidden_states_manager, + 'sa_manager', None) + if self.sa_enhancer is not None and sa_manager is not None: + self.sa_enhancer.extend_and_prepare( + sa_manager=sa_manager, + request_ids=spec_metadata.request_ids, + accepted_tokens=accepted_tokens, + num_accepted_tokens=num_accepted_tokens, + num_gens=num_gens, + num_contexts=num_contexts, + max_draft_len=mtp_num_modules, + ) return accepted_tokens, num_accepted_tokens @@ -1109,19 +1103,6 @@ def draft_sampler( # Simple argmax if no TP or no model config draft_tokens = self._draft_sampler_greedy(logits) - # select between MTP draft tokens and SA draft tokens - # Check sa_match_len is not None to handle case where 
use_sa_spec is True - if self.spec_config.use_sa_spec and self.sa_match_len is not None and ( - num_gens := self.sa_match_len.shape[0]) > 0: - num_contexts = draft_tokens.shape[0] - num_gens - - draft_tokens[num_contexts:] = torch.where( - self.sa_match_len >= self.spec_config.sa_spec_threshold, - self.sa_draft_tokens[:, self.sa_spec_index], - draft_tokens[num_contexts:]) - - self.sa_spec_index += 1 - return draft_tokens @@ -1333,6 +1314,17 @@ def update_kv_lens(kv_lens_cuda, batch_size): self._restore_attn_metadata_from_spec_dec(attn_metadata) attn_metadata.use_spec_decoding = True + # Override with SA draft tokens after all MTP layers have run, + # so that MTP layers never see SA tokens in their inputs. + # Must happen before stacking since next_draft_tokens is still a list. + if self.sa_enhancer is not None: + stacked = torch.stack(next_draft_tokens, dim=1) + gen_draft_tokens = stacked[num_contexts:] + gen_draft_tokens = self.sa_enhancer.maybe_override_all_draft_tokens( + gen_draft_tokens) + stacked[num_contexts:] = gen_draft_tokens + next_draft_tokens = [stacked[:, i] for i in range(stacked.shape[1])] + next_draft_tokens, next_new_tokens = self._prepare_next_tokens( next_draft_tokens, accepted_tokens, spec_metadata, batch_size, num_accepted_tokens) diff --git a/tensorrt_llm/_torch/speculative/pard.py b/tensorrt_llm/_torch/speculative/pard.py index fa25627bc14..f4da0e6c9d2 100644 --- a/tensorrt_llm/_torch/speculative/pard.py +++ b/tensorrt_llm/_torch/speculative/pard.py @@ -9,7 +9,9 @@ from tensorrt_llm.mapping import Mapping from ..attention_backend import AttentionMetadata +from ..pyexecutor.resource_manager import BaseResourceManager from .interface import SpecMetadata, SpecWorkerBase +from .sa_enhancer import SADraftEnhancer if TYPE_CHECKING: from ...llmapi.llm_args import PARDDecodingConfig @@ -20,6 +22,8 @@ class PARDSpecMetadata(SpecMetadata): """Metadata for PARD speculative decoding.""" batch_indices_cuda: Optional[torch.Tensor] = None + # Optional 
resource manager (used to access SA manager for PARD+SA) + spec_resource_manager: Optional[BaseResourceManager] = None def __post_init__(self): self.batch_indices_cuda = torch.empty( @@ -40,6 +44,27 @@ def prepare(self): ) self.batch_indices_cuda[:num_seqs].copy_(batch_indices, non_blocking=True) + sa_manager = self._get_sa_manager() + if sa_manager is not None: + gen_request_ids = self.request_ids[num_seqs - self.num_generations :] + if gen_request_ids: + sa_manager.prepare(gen_request_ids, self.max_draft_len) + + def _get_sa_manager(self): + """Get SA manager from spec_resource_manager. + + For PARD+SA the resource manager IS the SuffixAutomatonManager, + while for other techniques it's accessed via a .sa_manager attribute. + """ + from .suffix_automaton import SuffixAutomatonManager + + rm = self.spec_resource_manager + if rm is None: + return None + if isinstance(rm, SuffixAutomatonManager): + return rm + return getattr(rm, "sa_manager", None) + class PARDWorker(SpecWorkerBase): """ @@ -63,6 +88,9 @@ def __init__( super().__init__(use_separate_draft_kv_cache) self.spec_config = spec_config self.mapping = mapping + self.sa_enhancer: Optional[SADraftEnhancer] = None + if getattr(spec_config, "use_sa_spec", False): + self.sa_enhancer = SADraftEnhancer(spec_config.sa_spec_threshold) logger.info( f"PARDWorker initialized with use_separate_draft_kv_cache={use_separate_draft_kv_cache}" ) @@ -193,6 +221,18 @@ def forward( ) accepted_tokens = torch.cat([accepted_tokens, acc_padding], dim=1) + sa_manager = spec_metadata._get_sa_manager() if self.sa_enhancer else None + if self.sa_enhancer is not None and sa_manager is not None: + self.sa_enhancer.extend_and_prepare( + sa_manager=sa_manager, + request_ids=spec_metadata.request_ids, + accepted_tokens=accepted_tokens, + num_accepted_tokens=num_accepted_tokens, + num_gens=num_gens, + num_contexts=num_contexts, + max_draft_len=K, + ) + self._prepare_attn_metadata_for_pard(attn_metadata, spec_metadata) 
self._prepare_kv_for_draft_forward( attn_metadata, num_accepted_tokens, num_contexts, batch_size @@ -252,6 +292,11 @@ def forward( gen_draft_tokens = gen_draft_tokens.type(torch.int32) + if self.sa_enhancer is not None and sa_manager is not None: + gen_draft_tokens = self.sa_enhancer.maybe_override_all_draft_tokens( + gen_draft_tokens + ) + # Pad from (num_gens, K) to (num_gens, 2K-1). if K > 1: pad = torch.zeros((num_gens, K - 1), dtype=torch.int32, device="cuda") diff --git a/tensorrt_llm/_torch/speculative/sa_enhancer.py b/tensorrt_llm/_torch/speculative/sa_enhancer.py new file mode 100644 index 00000000000..dec2c4bbe10 --- /dev/null +++ b/tensorrt_llm/_torch/speculative/sa_enhancer.py @@ -0,0 +1,120 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Composable SA (Suffix Automaton) draft enhancer for one-engine speculative decoding workers. + +When enabled, SA pattern matching overrides neural draft tokens for requests +where the suffix match length exceeds the configured threshold. +""" + +from typing import List, Optional + +import torch + +from .suffix_automaton import SuffixAutomatonManager + + +class SADraftEnhancer: + """Composable SA enhancement for any one-engine spec worker. 
+ + This class encapsulates all SA-specific logic (extend, prepare buffers, + override draft tokens) so that any worker (MTP, EAGLE3, PARD, etc.) can + opt into SA enhancement. + + Usage: + 1. Construct once during worker ``__init__`` when ``use_sa_spec`` is True. + 2. Call ``extend_and_prepare`` after ``sample_and_accept_draft_tokens``. + 3. Call ``maybe_override_all_draft_tokens`` once after all draft layers + have finished, so that neural draft layers never see SA tokens. + """ + + def __init__(self, sa_spec_threshold: int): + self.sa_spec_threshold = sa_spec_threshold + self.sa_match_len: Optional[torch.Tensor] = None + self.sa_draft_tokens: Optional[torch.Tensor] = None + self.sa_spec_index: int = 0 + + def extend_and_prepare( + self, + sa_manager: SuffixAutomatonManager, + request_ids: List[int], + accepted_tokens: torch.Tensor, + num_accepted_tokens: torch.Tensor, + num_gens: int, + num_contexts: int, + max_draft_len: int, + ) -> None: + """Extend SA states with accepted tokens and prepare override buffers. + + Must be called after ``sample_and_accept_draft_tokens`` and before the + draft generation loop. + + Args: + sa_manager: The SuffixAutomatonManager instance. + request_ids: Full request ID list (contexts + generations). + accepted_tokens: [batch_size, padded_width] accepted tokens + (may be wider than max_draft_len + 1 due to caller padding). + num_accepted_tokens: [batch_size] number of accepted tokens. + num_gens: Number of generation requests in the batch. + num_contexts: Number of context requests in the batch. + max_draft_len: Number of draft positions to produce. 
+ """ + self.sa_match_len = torch.zeros((num_gens,), dtype=torch.int32, device="cuda") + self.sa_draft_tokens = torch.zeros( + (num_gens, max_draft_len), dtype=torch.int32, device="cuda" + ) + self.sa_spec_index = 0 + + if num_gens > 0: + gen_request_ids = request_ids[num_contexts:] + # The CUDA kernel indexes accepted tokens as + # acceptedTokensIn[i * (draftLength + 1) + j] + # so the physical stride must equal max_draft_len + 1. + # Callers like PARD pad accepted_tokens to [batch, 2K]; the + # slice + .contiguous() below compacts memory so the stride + # matches the kernel expectation. + gen_accepted = accepted_tokens[num_contexts:, : max_draft_len + 1].contiguous() + match_len, draft_tokens_sa = sa_manager.extend( + gen_request_ids, + gen_accepted, + num_accepted_tokens[num_contexts:], + max_draft_len, + ) + self.sa_match_len.copy_(match_len) + self.sa_draft_tokens.copy_(draft_tokens_sa) + + def maybe_override_all_draft_tokens( + self, + draft_tokens: torch.Tensor, + ) -> torch.Tensor: + """Override all K draft positions at once. + + Used by all one-engine workers (MTP, EAGLE3, PARD) to override neural + draft tokens with SA tokens after the draft loop completes. + + Args: + draft_tokens: [num_gens, K] draft tokens from the neural drafter. + + Returns: + The (potentially overridden) draft tokens tensor. 
+ """ + if self.sa_match_len is not None and self.sa_match_len.shape[0] > 0: + K = draft_tokens.shape[1] + mask = ( + (self.sa_match_len >= self.sa_spec_threshold).unsqueeze(1).expand_as(draft_tokens) + ) + draft_tokens = torch.where(mask, self.sa_draft_tokens[:, :K], draft_tokens) + + return draft_tokens diff --git a/tensorrt_llm/_torch/speculative/sa_worker.py b/tensorrt_llm/_torch/speculative/sa_worker.py index 5947f8ba4b3..7d7c2023121 100644 --- a/tensorrt_llm/_torch/speculative/sa_worker.py +++ b/tensorrt_llm/_torch/speculative/sa_worker.py @@ -81,7 +81,6 @@ def prepare(self) -> None: ) self.batch_indices_cuda[:num_seqs].copy_(batch_indices, non_blocking=True) - # Prepare SA manager (copies pending states to GPU) if self.sa_manager is not None: self.sa_manager.prepare(self.request_ids, self.max_draft_len) else: diff --git a/tensorrt_llm/_torch/speculative/suffix_automaton.py b/tensorrt_llm/_torch/speculative/suffix_automaton.py index df9d68d5f64..1c2ab067347 100644 --- a/tensorrt_llm/_torch/speculative/suffix_automaton.py +++ b/tensorrt_llm/_torch/speculative/suffix_automaton.py @@ -126,6 +126,11 @@ def __init__( # Track which requests have been initialized (for prepare_resources) self._initialized_requests: Set[int] = set() + # Reserved slot for CUDA graph dummy requests — shared by all dummies + # so they never consume slots from the real pool. + self._dummy_slot_index: int = max_num_requests + self._dummy_request_ids: Set[int] = set() + def _ensure_workspace(self, max_draft_len: int): """Ensure GPU workspace is allocated with sufficient capacity. @@ -146,9 +151,19 @@ def _ensure_workspace(self, max_draft_len: int): self._gpu_batch_indices = torch.zeros( (self.max_num_requests,), dtype=torch.int32, device="cuda" ) + # Mask: 1 for real requests, 0 for dummies. Populated by + # prepare() (outside CUDA graph) and used by extend() (inside + # CUDA graph) to zero out dummy entries without Python control + # flow that would break graph capture. 
+ self._gpu_nondummy_mask = torch.ones( + (self.max_num_requests,), dtype=torch.int32, device="cuda" + ) - # Allocate GPU workspace for SA states with dynamic size - self._gpu_slots = _sa_native.allocate_workspace(self.max_num_requests, self.max_seq_len) + # Allocate one extra slot beyond max_num_requests for the shared + # CUDA graph dummy (slot index = max_num_requests). + self._gpu_slots = _sa_native.allocate_workspace( + self.max_num_requests + 1, self.max_seq_len + ) self._allocated_max_draft_len = max_draft_len self._workspace_allocated = True @@ -200,13 +215,17 @@ def remove_request(self, request_id: int): return slot = self._request_to_slot.pop(request_id) - self._free_slots.append(slot) + + if request_id in self._dummy_request_ids: + # Dummy slot is reserved; never return it to the free pool. + self._dummy_request_ids.discard(request_id) + else: + self._free_slots.append(slot) self._host_states_native.pop(request_id, None) self._pending_copies.discard(request_id) self._initialized_requests.discard(request_id) - # Clear the GPU slot if self._gpu_slots is not None: _sa_native.clear_slot(self._gpu_slots, slot, self.max_seq_len) @@ -235,24 +254,27 @@ def prepare(self, request_ids: List[int], max_draft_len: int): ) self._pending_copies.clear() - # Validate request_ids and prepare batch indices - # Do not use a default fallback - unknown request IDs would corrupt slot 0's state - unknown_rids = [rid for rid in request_ids if rid not in self._request_to_slot] - if unknown_rids: - raise KeyError( - f"SuffixAutomatonManager.prepare(): Unknown request IDs {unknown_rids}. " - f"All request IDs must be added via add_request() before calling prepare(). " - f"Known request IDs: {list(self._request_to_slot.keys())}" - ) - + # Map each request ID to its slot. Unknown IDs (e.g. CUDA graph + # warmup dummies that skipped the context phase) are routed to the + # reserved dummy slot so the kernel still runs on valid memory. 
+ slots = [self._request_to_slot.get(rid, self._dummy_slot_index) for rid in request_ids] batch_indices = torch.tensor( - [self._request_to_slot[rid] for rid in request_ids], + slots, + dtype=torch.int32, + pin_memory=prefer_pinned(), + ) + # Build a non-dummy mask (1 = real, 0 = dummy) on CPU, then copy to + # the pre-allocated GPU buffer. extend() will use this mask via a + # simple element-wise multiply which is CUDA-graph-safe. + nondummy_mask = torch.tensor( + [0 if s == self._dummy_slot_index else 1 for s in slots], dtype=torch.int32, pin_memory=prefer_pinned(), ) num_requests = len(request_ids) self._gpu_batch_indices[:num_requests].copy_(batch_indices, non_blocking=True) + self._gpu_nondummy_mask[:num_requests].copy_(nondummy_mask, non_blocking=True) torch.cuda.synchronize() def extend( @@ -295,10 +317,16 @@ def extend( if num_accepted_tokens.dtype != torch.int32: num_accepted_tokens = num_accepted_tokens.to(torch.int32) + # Zero out accepted-token counts for dummy entries so the kernel's + # extend() loop is a no-op for them, avoiding the concurrent-write + # race when multiple dummies share one slot. The mask is populated + # by prepare() (outside CUDA graph); the multiply is graph-safe. + num_accepted_tokens = num_accepted_tokens * self._gpu_nondummy_mask[:batch_size] + _sa_native.invoke_extend( batch_size, max_draft_len, - self.max_num_requests, + self.max_num_requests + 1, self.max_seq_len, self._gpu_slots, self._gpu_batch_indices[:batch_size], @@ -351,11 +379,14 @@ def extend_ngram( if num_accepted_tokens.dtype != torch.int32: num_accepted_tokens = num_accepted_tokens.to(torch.int32) + # Zero out dummy entries (see extend() for rationale). 
+ num_accepted_tokens = num_accepted_tokens * self._gpu_nondummy_mask[:batch_size] + _sa_native.invoke_extend_ngram( batch_size, max_draft_len, max_ngram_size, - self.max_num_requests, + self.max_num_requests + 1, self.max_seq_len, self._gpu_slots, self._gpu_batch_indices[:batch_size], @@ -387,9 +418,22 @@ def free_resources(self, request: LlmRequest): self.remove_request(request.request_id) def add_dummy_requests(self, request_ids: List[int]): - """Add dummy requests for CUDA graph warmup.""" + """Add dummy requests for CUDA graph padding. + + Dummy requests are mapped to a single reserved slot + (index = max_num_requests) that lives outside the real slot pool. + This prevents CUDA graph padding from exhausting slots that real + requests need. + + No host automaton is built -- the GPU slot is already zeroed by + allocate_workspace (at::zeros), so the kernel safely produces + match_len = 0 for dummies. + """ for rid in request_ids: - self.add_request(rid, [1]) # Dummy token + if rid in self._request_to_slot: + continue + self._request_to_slot[rid] = self._dummy_slot_index + self._dummy_request_ids.add(rid) def shutdown(self): """Clean up all resources.""" @@ -398,6 +442,7 @@ def shutdown(self): self._request_to_slot.clear() self._free_slots = list(range(self.max_num_requests)) + self._dummy_request_ids.clear() self._host_states_native.clear() self._pending_copies.clear() diff --git a/tensorrt_llm/_torch/speculative/utils.py b/tensorrt_llm/_torch/speculative/utils.py index 17892bee8c3..a4ede54874d 100644 --- a/tensorrt_llm/_torch/speculative/utils.py +++ b/tensorrt_llm/_torch/speculative/utils.py @@ -35,11 +35,6 @@ def get_spec_metadata(spec_config, is_draft_model=False, max_seq_len=262144): if spec_config.spec_dec_mode.is_mtp_one_model(): - # Get SA manager from spec_resource_manager if MTP+SA mode - sa_manager = None - if spec_resource_manager is not None and hasattr( - spec_resource_manager, 'sa_manager'): - sa_manager = spec_resource_manager.sa_manager return 
MTPSpecMetadata( max_draft_len=spec_config.max_draft_len, max_total_draft_tokens=spec_config.tokens_per_gen_step - 1, @@ -47,7 +42,6 @@ def get_spec_metadata(spec_config, mtp_num_modules=spec_config.num_nextn_predict_layers, max_num_requests=max_num_requests, mtp_hidden_states_manager=spec_resource_manager, - sa_manager=sa_manager, allow_advanced_sampling=spec_config.allow_advanced_sampling, ) if spec_config.spec_dec_mode.is_mtp_eagle(): @@ -95,6 +89,7 @@ def get_spec_metadata(spec_config, max_num_tokens=max_num_tokens, layers_to_capture=spec_config.eagle3_layers_to_capture, allow_advanced_sampling=spec_config.allow_advanced_sampling, + spec_resource_manager=spec_resource_manager, ) if spec_config.spec_dec_mode.is_pard(): return PARDSpecMetadata( @@ -103,6 +98,7 @@ def get_spec_metadata(spec_config, spec_dec_mode=spec_config.spec_dec_mode, max_num_requests=max_num_requests, allow_advanced_sampling=spec_config.allow_advanced_sampling, + spec_resource_manager=spec_resource_manager, ) if spec_config.spec_dec_mode.is_draft_target_one_model(): return DraftTargetOneModelSpecMetadata( @@ -183,6 +179,22 @@ def get_spec_resource_manager(model_engine, draft_model_engine=None): max_num_requests, sa_manager=sa_manager, ) + if spec_dec_mode.is_eagle3_one_model(): + sa_manager = None + if getattr(spec_config, 'use_sa_spec', False): + sa_manager = SuffixAutomatonManager(spec_config, max_num_requests, + max_seq_len) + if sa_manager is not None: + return Eagle3ResourceManager( + spec_config, + model_config.torch_dtype, + model_config.hidden_size, + max_num_requests, + max_seq_len, + max_num_tokens, + sa_manager=sa_manager, + ) + return None if spec_dec_mode.is_eagle3() or spec_dec_mode.is_mtp_eagle(): assert draft_model_engine is not None, "Draft model engine is required for Eagle3 and MTP Eagle two model flow." 
return Eagle3ResourceManager( @@ -201,6 +213,11 @@ def get_spec_resource_manager(model_engine, draft_model_engine=None): max_num_requests, max_num_tokens, ) + if spec_dec_mode.is_pard(): + if getattr(spec_config, 'use_sa_spec', False): + return SuffixAutomatonManager(spec_config, max_num_requests, + max_seq_len) + return None if spec_dec_mode.is_ngram(): return NGramPoolManager(spec_config, max_num_requests) if spec_dec_mode.is_sa(): diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 0d86da2d786..91419488c46 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -1025,6 +1025,17 @@ def is_linear_tree(self) -> bool: class Eagle3DecodingConfig(EagleDecodingConfig): decoding_type: Literal["Eagle3"] = "Eagle3" + # Suffix Automaton speculative decoding settings + use_sa_spec: Optional[bool] = Field( + default=False, + status="beta", + description="Combine with Suffix Automaton Decoding") + sa_spec_threshold: PositiveInt = Field( + default=4, + description="The threshold for the Suffix Automaton Decoding. If the" + " length of the suffix match exceeds the threshold, use" + " the suffix automaton output for the next draft tokens.") + class SaveHiddenStatesDecodingConfig(DecodingBaseConfig): decoding_type: Literal["SaveState"] = "SaveState" @@ -1251,7 +1262,7 @@ class MTPDecodingConfig(DecodingBaseConfig): default=False, status="beta", description="Combine with Suffix Automaton Decoding") - sa_spec_threshold: int = Field( + sa_spec_threshold: PositiveInt = Field( default=4, description="The threshold for the Suffix Automaton Decoding. 
If the" " length of the suffix match exceeds the threshold, use" @@ -1335,6 +1346,17 @@ class PARDDecodingConfig(DecodingBaseConfig): decoding_type: Literal["PARD"] = "PARD" + # Suffix Automaton speculative decoding settings + use_sa_spec: Optional[bool] = Field( + default=False, + status="beta", + description="Combine with Suffix Automaton Decoding") + sa_spec_threshold: PositiveInt = Field( + default=4, + description="The threshold for the Suffix Automaton Decoding. If the" + " length of the suffix match exceeds the threshold, use" + " the suffix automaton output for the next draft tokens.") + @model_validator(mode="after") def set_max_total_draft_tokens(self): self.max_total_draft_tokens = self.max_draft_len diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml index a44adee5f29..fe50dc26f91 100644 --- a/tests/integration/defs/accuracy/references/gsm8k.yaml +++ b/tests/integration/defs/accuracy/references/gsm8k.yaml @@ -8,8 +8,14 @@ meta-llama/Llama-3.1-8B-Instruct: accuracy: 74.20 - spec_dec_algo: Eagle3 accuracy: 74.20 + - spec_dec_algo: Eagle3 + extra_acc_spec: use_sa_spec + accuracy: 74.20 - spec_dec_algo: PARD accuracy: 74.20 + - spec_dec_algo: PARD + extra_acc_spec: use_sa_spec + accuracy: 74.20 - quant_algo: FP8 accuracy: 74.30 - quant_algo: FP8 diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index c8a1da0ba47..7a312d7428e 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -311,6 +311,32 @@ def test_eagle3(self, overlap_scheduler, eagle3_one_model, task = GSM8K(self.MODEL_NAME) task.evaluate(llm) + @skip_pre_hopper + def test_eagle3_sa(self): + """Accuracy test for EAGLE3 One-Model + Suffix Automaton speculative decoding.""" + pytorch_config = dict( + max_batch_size=1, + disable_overlap_scheduler=False, + 
cuda_graph_config=CudaGraphConfig(max_batch_size=1, + enable_padding=True), + ) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8) + + eagle_model_dir = f"{llm_models_root()}/EAGLE3-LLaMA3.1-Instruct-8B" + target_model_dir = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct" + + spec_config = Eagle3DecodingConfig(max_draft_len=4, + speculative_model=eagle_model_dir, + eagle3_one_model=True, + use_sa_spec=True) + + with LLM(model=target_model_dir, + **pytorch_config, + kv_cache_config=kv_cache_config, + speculative_config=spec_config) as llm: + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm, extra_acc_spec="use_sa_spec") + @skip_pre_hopper @parametrize_with_ids("overlap_scheduler", [True, False]) def test_pard(self, overlap_scheduler): @@ -341,6 +367,31 @@ def test_pard(self, overlap_scheduler): task = GSM8K(self.MODEL_NAME) task.evaluate(llm) + @skip_pre_hopper + def test_pard_sa(self): + """Accuracy test for PARD + Suffix Automaton speculative decoding.""" + pytorch_config = dict( + max_batch_size=1, + disable_overlap_scheduler=False, + cuda_graph_config=CudaGraphConfig(max_batch_size=1, + enable_padding=True), + ) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8) + + pard_model_dir = f"{llm_models_root()}/PARD-Llama-3.2-1B" + target_model_dir = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct" + + spec_config = PARDDecodingConfig(max_draft_len=4, + speculative_model=pard_model_dir, + use_sa_spec=True) + + with LLM(model=target_model_dir, + **pytorch_config, + kv_cache_config=kv_cache_config, + speculative_config=spec_config) as llm: + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm, extra_acc_spec="use_sa_spec") + @skip_pre_hopper def test_ngram(self): max_bs = 16 @@ -1537,8 +1588,6 @@ def test_bfloat16_mtp_sa(self): speculative_config=mtp_config) as llm: task = GSM8K(self.MODEL_NAME) task.evaluate(llm, extra_acc_spec="use_sa_spec") - task = MMLU(self.MODEL_NAME) - task.evaluate(llm, 
extra_acc_spec="use_sa_spec") @pytest.mark.skip_less_device(4) @parametrize_with_ids("torch_compile", [False, True]) diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index d95b41bb255..72762da3ad6 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ b/tests/integration/test_lists/qa/llm_function_core.txt @@ -8,6 +8,7 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=True-overlap_scheduler=True] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=False-overlap_scheduler=False] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=True-eagle3_one_model=True-overlap_scheduler=True] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3_sa accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance] @@ -35,6 +36,7 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_pard[overlap_scheduler=True] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_pard[overlap_scheduler=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_pard_sa accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized 
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype From 3b82b6cac02154eb7b6d9b7b9c8626d804fb0ff1 Mon Sep 17 00:00:00 2001 From: tburt-nv <195370667+tburt-nv@users.noreply.github.com> Date: Mon, 9 Mar 2026 13:16:51 -0400 Subject: [PATCH 5/9] [None][chore] waive test_visual_gen_quickstart (#12043) Signed-off-by: Tyler Burt <195370667+tburt-nv@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 22225fc32bd..5fa921dfb25 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -375,3 +375,4 @@ unittest/auto_deploy/multigpu/transformations/library/test_tp_sharding.py::test_ perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugs/4846166) perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX] SKIP (https://nvbugs/5846166) full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5961814) +examples/test_visual_gen.py::test_visual_gen_quickstart SKIP (https://nvbugs/5963896) From 69de4a60e7db73177e30bb80f333e6a35091a3dd Mon Sep 17 00:00:00 2001 From: NVShreyas <158103197+NVShreyas@users.noreply.github.com> Date: Mon, 9 Mar 2026 10:17:47 -0700 Subject: [PATCH 6/9] [None][feat] NIXL support for hybrid model cache transfer (#11608) Signed-off-by: Shreyas Misra --- .../batch_manager/cacheTransceiver.h | 2 +- .../tensorrt_llm/executor/cacheCommunicator.h | 9 + .../batch_manager/baseTransBuffer.h | 10 ++ .../batch_manager/cacheFormatter.cpp | 15 +- .../batch_manager/cacheTransBuffer.cpp | 1 + 
.../batch_manager/cacheTransBuffer.h | 6 + .../batch_manager/cacheTransceiver.cpp | 42 +++-- .../batch_manager/cacheTransferLayer.cpp | 8 + .../batch_manager/dataTransceiver.cpp | 76 ++++++-- .../batch_manager/mlaCacheFormatter.cpp | 27 ++- .../batch_manager/rnnCacheFormatter.cpp | 30 +++- .../batch_manager/rnnCacheTransBuffer.h | 6 +- .../agent_utils/connection.cpp | 163 +++++++++++++----- .../agent_utils/connection.h | 57 ++++-- .../unit_tests/executor/agentCommTest.cpp | 4 +- .../executor/serializeUtilsTest.cpp | 45 ++++- .../multi_gpu/cacheTransceiverTest.cpp | 10 +- .../_torch/pyexecutor/kv_cache_transceiver.py | 13 +- .../_torch/pyexecutor/model_engine.py | 4 +- .../accuracy/test_disaggregated_serving.py | 22 ++- .../test_lists/qa/llm_function_core.txt | 2 + .../test_lists/test-db/l0_dgx_b200.yml | 2 + .../others/test_kv_cache_transceiver.py | 6 +- 23 files changed, 412 insertions(+), 148 deletions(-) diff --git a/cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h b/cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h index 9f870312838..8f833060389 100644 --- a/cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h +++ b/cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h @@ -288,7 +288,7 @@ class CacheTransceiver : public BaseCacheTransceiver std::unique_ptr mManager; std::optional mCacheTransceiverConfig; std::vector> mCacheTransBufferManagers; - std::vector mCacheTransBufferManagerPtrs; + std::vector mCacheTransBufferManagerPtrs; rnn_state_manager::RnnStateManager* mRnnStateManager{nullptr}; // TODO(shreyasm): update this to use same container as kv by using base trans buffers instead diff --git a/cpp/include/tensorrt_llm/executor/cacheCommunicator.h b/cpp/include/tensorrt_llm/executor/cacheCommunicator.h index 286be2988c2..52a7d3ca6d5 100644 --- a/cpp/include/tensorrt_llm/executor/cacheCommunicator.h +++ b/cpp/include/tensorrt_llm/executor/cacheCommunicator.h @@ -18,6 +18,8 @@ #include "tensorrt_llm/executor/serialization.h" #include 
+#include +#include #include namespace tensorrt_llm::executor::kv_cache @@ -63,6 +65,13 @@ class Connection { return false; } + + virtual void activateBuffer(uint8_t /*kind*/) const {} + + [[nodiscard]] virtual std::optional getPreAssignedBufferId(uint8_t /*kind*/) const + { + return std::nullopt; + } }; class ConnectionManager diff --git a/cpp/tensorrt_llm/batch_manager/baseTransBuffer.h b/cpp/tensorrt_llm/batch_manager/baseTransBuffer.h index ec311e5c400..1efeb89ccc0 100644 --- a/cpp/tensorrt_llm/batch_manager/baseTransBuffer.h +++ b/cpp/tensorrt_llm/batch_manager/baseTransBuffer.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -38,6 +39,13 @@ class FabricMemory; namespace tensorrt_llm::batch_manager { +enum class BufferKind : uint8_t +{ + kKV = 0, + kKV_INDEXER = 1, + kRNN = 2 +}; + /// @brief Base class for cache transfer buffer management. /// Handles buffer pool allocation, index assignment, and slicing. /// Derived classes provide cache-specific size calculations. @@ -46,6 +54,8 @@ class BaseTransBufferManager public: virtual ~BaseTransBufferManager() = default; + [[nodiscard]] virtual BufferKind getBufferKind() const = 0; + /// @brief Assign a buffer index for sending. /// @return Assigned buffer index, or nullopt if using dynamic buffers. 
std::optional assignBufferIndexForSend(); diff --git a/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp b/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp index ae2822faa53..0c91aa6860e 100644 --- a/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp +++ b/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp @@ -539,9 +539,9 @@ void CacheFormatter::format(tensorrt_llm::batch_manager::TransferSession& sessio "bufferCoverTargetNum:%d pickUpConnections.size():%ld", bufferTargetNum, targetNum, peerDuplicateHeadFactor, targetInfo.mDupHeadFactor, bufferCoverTargetNum, pickUpConnections.size()); - auto* agentConnnecion + auto const* agentConnection = dynamic_cast(connections[pickUpConnections[0]]); - if (agentConnnecion != nullptr) + if (agentConnection != nullptr) { TLLM_CHECK_WITH_INFO(bufferCoverTargetNum == bufferTargetNum, "Agent need all buffer pre-allocated"); TLLM_CHECK(onlyUseDynamicBuffer == false); @@ -792,12 +792,11 @@ void CacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& sess TLLM_CHECK(blockNum > 0); - auto* agentConnnecion - = dynamic_cast(connections[pickUpConnections[0]]); - if (agentConnnecion != nullptr) + auto preAssignedKvId + = connections[pickUpConnections[0]]->getPreAssignedBufferId(static_cast(BufferKind::kKV)); + if (preAssignedKvId.has_value()) { - cacheBufferId = agentConnnecion->getCacheBufferId(); - TLLM_CHECK(cacheBufferId.has_value()); + cacheBufferId = static_cast(*preAssignedKvId); } else { @@ -811,7 +810,7 @@ void CacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& sess bufferCoverTargetNum = bufferCoverTargetNumtmp; remainNoCoverTargetNum = targetNum > bufferCoverTargetNum ? 
targetNum - bufferCoverTargetNum : 0; - if (agentConnnecion != nullptr) + if (preAssignedKvId.has_value()) { TLLM_CHECK_WITH_INFO(bufferCoverTargetNum == targetNum, "Agent need buffer pre-allocated"); TLLM_CHECK(onlyUseDynamicBuffer == false); diff --git a/cpp/tensorrt_llm/batch_manager/cacheTransBuffer.cpp b/cpp/tensorrt_llm/batch_manager/cacheTransBuffer.cpp index fca4419f22f..875e3c7e3be 100644 --- a/cpp/tensorrt_llm/batch_manager/cacheTransBuffer.cpp +++ b/cpp/tensorrt_llm/batch_manager/cacheTransBuffer.cpp @@ -249,6 +249,7 @@ CacheTransBufferManager::CacheTransBufferManager( : cacheManager->getPrimaryPool(0)->getDataType(), maxNumTokens) , mCacheManager{cacheManager} + , mTransferIndexerKCache{transferIndexerKCache} { // TODO: FP4 dataSize TLLM_CHECK(mCacheManager); diff --git a/cpp/tensorrt_llm/batch_manager/cacheTransBuffer.h b/cpp/tensorrt_llm/batch_manager/cacheTransBuffer.h index 56607acf23b..b63f18ab797 100644 --- a/cpp/tensorrt_llm/batch_manager/cacheTransBuffer.h +++ b/cpp/tensorrt_llm/batch_manager/cacheTransBuffer.h @@ -74,12 +74,18 @@ class CacheTransBufferManager : public BaseTransBufferManager return mCacheManager; } + [[nodiscard]] BufferKind getBufferKind() const override + { + return mTransferIndexerKCache ? BufferKind::kKV_INDEXER : BufferKind::kKV; + } + private: /// @brief Compute transfer buffer size from KV cache configuration. 
static size_t computeTransferBufferSize(KVCacheManager::BaseKVCacheManager* cacheManager, std::optional maxNumTokens, bool transferIndexerKCache); KVCacheManager::BaseKVCacheManager* mCacheManager; + bool mTransferIndexerKCache; }; } // namespace tensorrt_llm::batch_manager::kv_cache_manager diff --git a/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp b/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp index a59022edbf2..2e4bf1f0666 100644 --- a/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp +++ b/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp @@ -185,12 +185,6 @@ CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa mCacheTransBufferManagers.push_back( std::make_unique(cacheManager, maxNumTokens, true)); } - mCacheTransBufferManagerPtrs.clear(); - mCacheTransBufferManagerPtrs.reserve(mCacheTransBufferManagers.size()); - for (auto& manager : mCacheTransBufferManagers) - { - mCacheTransBufferManagerPtrs.push_back(manager.get()); - } // RNN specific setup if (mRnnStateManager != nullptr) @@ -198,13 +192,6 @@ CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa TLLM_LOG_DEBUG("Setting up RNN cache transfer components."); TLLM_CHECK(!rnnLayerNumPerPP.empty()); - if (backendType.value() == executor::CacheTransceiverConfig::BackendType::NIXL - || backendType.value() == executor::CacheTransceiverConfig::BackendType::MOONCAKE) - { - TLLM_LOG_ERROR("RNN cache transfer is not supported for NIXL and MOONCAKE yet"); - return; - } - mRnnCacheTransBufferManager = std::make_unique(mRnnStateManager, maxNumTokens); @@ -218,6 +205,17 @@ CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa TLLM_LOG_INFO("RNN cache transfer components initialized."); } + mCacheTransBufferManagerPtrs.clear(); + mCacheTransBufferManagerPtrs.reserve(mCacheTransBufferManagers.size() + (mRnnCacheTransBufferManager ? 
1 : 0)); + for (auto& manager : mCacheTransBufferManagers) + { + mCacheTransBufferManagerPtrs.push_back(manager.get()); + } + if (mRnnCacheTransBufferManager) + { + mCacheTransBufferManagerPtrs.push_back(mRnnCacheTransBufferManager.get()); + } + if (backendType.value() == executor::CacheTransceiverConfig::BackendType::UCX) { std::lock_guard lock(mDllMutex); @@ -239,14 +237,18 @@ CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa } else if (backendType.value() == executor::CacheTransceiverConfig::BackendType::NIXL) { + auto rnnState + = mCacheState->hasRnnConfig() ? std::make_optional(mCacheState->getRnnCacheState()) : std::nullopt; mManager = std::make_unique( - mCacheTransBufferManagerPtrs, *mCacheState, "nixl"); + mCacheTransBufferManagerPtrs, *mCacheState, "nixl", rnnState); TLLM_LOG_INFO("NIXL Connection Manager created"); } else if (backendType.value() == executor::CacheTransceiverConfig::BackendType::MOONCAKE) { + auto rnnState + = mCacheState->hasRnnConfig() ? 
std::make_optional(mCacheState->getRnnCacheState()) : std::nullopt; mManager = std::make_unique( - mCacheTransBufferManagerPtrs, *mCacheState, "mooncake"); + mCacheTransBufferManagerPtrs, *mCacheState, "mooncake", rnnState); TLLM_LOG_INFO("MOONCAKE Connection Manager created"); } else if (backendType.value() == executor::CacheTransceiverConfig::BackendType::MPI) @@ -261,7 +263,15 @@ CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa } auto makeFormatter = [cacheManager, isMLA, this]() - { return createCacheFormatter(cacheManager, mCacheTransBufferManagerPtrs, isMLA); }; + { + std::vector kvBufferPtrs; + kvBufferPtrs.reserve(mCacheTransBufferManagers.size()); + for (auto& mgr : mCacheTransBufferManagers) + { + kvBufferPtrs.push_back(mgr.get()); + } + return createCacheFormatter(cacheManager, kvBufferPtrs, isMLA); + }; auto makeRnnFormatter = [this]() -> std::unique_ptr { diff --git a/cpp/tensorrt_llm/batch_manager/cacheTransferLayer.cpp b/cpp/tensorrt_llm/batch_manager/cacheTransferLayer.cpp index 4c74565df22..7d49bfa9545 100644 --- a/cpp/tensorrt_llm/batch_manager/cacheTransferLayer.cpp +++ b/cpp/tensorrt_llm/batch_manager/cacheTransferLayer.cpp @@ -21,6 +21,7 @@ #include "tensorrt_llm/batch_manager/rnnCacheFormatter.h" #include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/executor/cache_transmission/agent_utils/connection.h" #include "tensorrt_llm/executor/cache_transmission/cacheSplitConcat.h" #include @@ -95,6 +96,13 @@ void CacheTransferLayer::format(TransferSession& session) const mKvFormatter->format(session); if (mRnnFormatter) { + for (auto const* conn : session.getConnections()) + { + if (conn != nullptr) + { + conn->activateBuffer(static_cast(BufferKind::kRNN)); + } + } mRnnFormatter->format(session); } } diff --git a/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp b/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp index e4d46f900f9..a18308c2f0e 100644 --- 
a/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp +++ b/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp @@ -26,6 +26,7 @@ #include "tensorrt_llm/common/tllmException.h" #include "tensorrt_llm/common/utils.h" #include "tensorrt_llm/executor/cache_transmission/agent_utils/connection.h" +#include "tensorrt_llm/executor/cache_transmission/cacheSplitConcat.h" #include "tensorrt_llm/runtime/common.h" #include "tensorrt_llm/runtime/utils/mpiUtils.h" #include @@ -384,9 +385,10 @@ class CacheSender::Impl auto allCounterparts = mCacheTransferLayer.computeCounterparts( mSelfState.getCommState().value().getSelfIdx(), info.getTransState()); - auto peerSelfIdx = info.getTransState().getCommState()->getSelfIdx(); // Index of self in peer's comm state + auto peerSelfIdx = info.getTransState().getCommState()->getSelfIdx(); int peerIdx = std::distance( allCounterparts.begin(), std::find(allCounterparts.begin(), allCounterparts.end(), peerSelfIdx)); + TLLM_CHECK_WITH_INFO(peerIdx < static_cast(allCounterparts.size()), "Peer rank %d not found in expected counterparts", peerSelfIdx); { @@ -861,6 +863,19 @@ class CacheReceiver::Impl auto allCounterparts = mCacheTransferLayer.computeCounterparts(mSelfState.getCommState().value().getSelfIdx(), contextState); + auto kvCounterParts = mCacheTransferLayer.getKvFormatter()->getCounterparts( + mCacheTransferLayer.getCacheState(), mSelfState.getCommState().value().getSelfIdx(), destCacheState); + + bool hasRnn = mCacheTransferLayer.getCacheState().hasRnnConfig() && destCacheState.hasRnnConfig(); + + std::vector rnnCounterParts; + if (hasRnn) + { + rnnCounterParts = executor::kv_cache::targetIRanksForRnn( + destCacheState, mCacheTransferLayer.getCacheState(), mSelfState.getCommState().value().getSelfIdx()) + .mIRanks; + } + auto connections = mManager->getConnections(commState); std::vector allConnections; for (auto index : allCounterparts) @@ -869,24 +884,59 @@ class CacheReceiver::Impl allConnections.emplace_back(connection); } - for 
(size_t i = 0; i < allConnections.size(); i++) + for (size_t ci = 0; ci < allCounterparts.size(); ci++) { - auto const* connection = allConnections[i]; - // if Manager is agentConnectionManager, then send request info to agent - auto* agentConnectionManager = dynamic_cast(mManager); + auto rank = allCounterparts[ci]; + auto const* connection = connections.at(rank); + + bool isKvCounterpart + = std::find(kvCounterParts.begin(), kvCounterParts.end(), rank) != kvCounterParts.end(); + bool isRnnCounterpart + = hasRnn && std::find(rnnCounterParts.begin(), rnnCounterParts.end(), rank) != rnnCounterParts.end(); + if (agentConnectionManager) { - // TODO: index -> validConnectionIdx conversion - // TODO(shreyasm): this will not work for RNN. Will error out in the constructor if used with RNN. - auto [pickUpIdx, localRankIdx] = mCacheTransferLayer.getKvFormatter()->pickRecvConnections( - allCounterparts.size(), mSelfState.getCacheState().value(), - mSelfState.getCommState().value().getSelfIdx(), destCacheState, allCounterparts); - auto validConnectionIdx = std::find(localRankIdx.begin(), localRankIdx.end(), i) - localRankIdx.begin(); + auto idsForRank = cacheBufferIds; + auto const& managers = agentConnectionManager->getCacheTransBufferManagers(); + for (size_t i = 0; i < idsForRank.size(); i++) + { + auto kind = managers[i]->getBufferKind(); + bool include = (kind != BufferKind::kRNN) ? 
isKvCounterpart : isRnnCounterpart; + if (!include) + { + idsForRank[i] = std::nullopt; + } + } + + int validConnectionIdx = 0; + if (isKvCounterpart) + { + auto kvCpIdx + = std::find(kvCounterParts.begin(), kvCounterParts.end(), rank) - kvCounterParts.begin(); + auto [pickUpIdx, localRankIdx] = mCacheTransferLayer.getKvFormatter()->pickRecvConnections( + allCounterparts.size(), mSelfState.getCacheState().value(), + mSelfState.getCommState().value().getSelfIdx(), destCacheState, allCounterparts); + validConnectionIdx + = std::find(localRankIdx.begin(), localRankIdx.end(), kvCpIdx) - localRankIdx.begin(); + } + else if (isRnnCounterpart) + { + auto rnnTargetInfo = executor::kv_cache::targetIRanksForRnn(destCacheState, + mCacheTransferLayer.getCacheState(), mSelfState.getCommState().value().getSelfIdx()); + auto rnnCpIdx + = std::find(rnnCounterParts.begin(), rnnCounterParts.end(), rank) - rnnCounterParts.begin(); + auto [pickUpIdx, localRankIdx] = cache_formatter_utils::pickRecvConnections(rnnCounterParts.size(), + mCacheTransferLayer.getCacheState(), mSelfState.getCommState().value().getSelfIdx(), + destCacheState, rnnCounterParts, rnnTargetInfo); + validConnectionIdx + = std::find(localRankIdx.begin(), localRankIdx.end(), rnnCpIdx) - localRankIdx.begin(); + } + auto* agentConnection = dynamic_cast(connection); TLLM_CHECK(agentConnection != nullptr); - TLLM_CHECK(!cacheBufferIds.empty()); + const_cast(agentConnection) - ->sendRequestAndBufferInfo(requestInfo, cacheBufferIds, validConnectionIdx); + ->sendRequestAndBufferInfo(requestInfo, idsForRank, validConnectionIdx); } else { diff --git a/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp b/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp index 9cedd091920..c72090867f2 100644 --- a/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp +++ b/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp @@ -168,16 +168,12 @@ void MLACacheFormatter::format(tensorrt_llm::batch_manager::TransferSession& ses for (auto 
transferIndexerKCache : transferringIndexerKCache) { - auto activeBufferIdx = transferIndexerKCache ? 1UL : 0UL; + auto bufferKind = transferIndexerKCache ? static_cast(BufferKind::kKV_INDEXER) + : static_cast(BufferKind::kKV); for (size_t i = 0; i < pickUpConnections.size(); i++) { auto const* connection = connections.at(pickUpConnections[i]); - if (auto const* agentConnection = dynamic_cast(connection)) - { - TLLM_CHECK(agentConnection->getSenderBufferCount() > activeBufferIdx); - const_cast(agentConnection) - ->setActiveSenderBufferIdx(activeBufferIdx); - } + connection->activateBuffer(bufferKind); } int blockNum = 0; std::vector inputKvCacheBlocks; @@ -263,9 +259,9 @@ void MLACacheFormatter::format(tensorrt_llm::batch_manager::TransferSession& ses auto& outputSplitCaches = std::get<0>(result); auto& bufferCoverTargetNum = std::get<1>(result); auto& onlyUseDynamicBuffer = std::get<2>(result); - auto* agentConnnecion + auto const* agentConnection = dynamic_cast(connections[pickUpConnections[0]]); - if (agentConnnecion != nullptr) + if (agentConnection != nullptr) { TLLM_CHECK_WITH_INFO( bufferCoverTargetNum == pPDomainSize * cPDomainSize, "Agent need all buffer pre-allocated"); @@ -488,13 +484,12 @@ void MLACacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& s } else { - auto* agentConnnecion - = dynamic_cast(connections[pickUpConnections[0]]); - size_t activeBufferIdx = transferIndexerKCache ? 1 : 0; - if (agentConnnecion != nullptr) + auto bufferKind = transferIndexerKCache ? 
static_cast(BufferKind::kKV_INDEXER) + : static_cast(BufferKind::kKV); + auto preAssignedId = connections[pickUpConnections[0]]->getPreAssignedBufferId(bufferKind); + if (preAssignedId.has_value()) { - cacheBufferId = agentConnnecion->getCacheBufferId(activeBufferIdx); - TLLM_CHECK(cacheBufferId.has_value()); + cacheBufferId = static_cast(*preAssignedId); } else { @@ -530,7 +525,7 @@ void MLACacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& s auto& bufferCoverTargetNum = std::get<1>(result); size_t remainNoCoverTargetNum = targetNum > bufferCoverTargetNum ? targetNum - bufferCoverTargetNum : 0; auto& onlyUseDynamicBuffer = std::get<2>(result); - if (agentConnnecion != nullptr) + if (preAssignedId.has_value()) { TLLM_CHECK_WITH_INFO(bufferCoverTargetNum == targetNum, "Agent need buffer pre-allocated"); TLLM_CHECK(onlyUseDynamicBuffer == false); diff --git a/cpp/tensorrt_llm/batch_manager/rnnCacheFormatter.cpp b/cpp/tensorrt_llm/batch_manager/rnnCacheFormatter.cpp index 18c9ed2e09c..1fd1cbdc253 100644 --- a/cpp/tensorrt_llm/batch_manager/rnnCacheFormatter.cpp +++ b/cpp/tensorrt_llm/batch_manager/rnnCacheFormatter.cpp @@ -22,6 +22,7 @@ #include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/common/nvtxUtils.h" +#include "tensorrt_llm/executor/cache_transmission/agent_utils/connection.h" #include "tensorrt_llm/executor/cache_transmission/cacheSplitConcat.h" #include @@ -156,6 +157,14 @@ void RnnCacheFormatter::format(TransferSession& session) TLLM_CHECK(cacheBufferId.has_value() || onlyUseDynamicBuffer); + auto const* agentConnection + = dynamic_cast(connections[pickUpConnections[0]]); + if (agentConnection != nullptr) + { + TLLM_CHECK_WITH_INFO(bufferCoverTargetNum == bufferTargetNum, "Agent needs all RNN send buffers pre-allocated"); + TLLM_CHECK(onlyUseDynamicBuffer == false); + } + std::vector inputConvBlocks; std::vector inputSsmBlocks; @@ -302,7 +311,19 @@ void 
RnnCacheFormatter::unformat(TransferSession& session) // Allocate receive buffers size_t remainNoCoverSourceNum = 0; size_t bufferCoverSourceNum = 0; - auto cacheBufferId = mRnnCacheTransBufferManager->assignBufferIndexForRecv(); + std::optional cacheBufferId = std::nullopt; + + auto preAssignedRnnId + = connections[pickUpConnections[0]]->getPreAssignedBufferId(static_cast(BufferKind::kRNN)); + if (preAssignedRnnId.has_value()) + { + cacheBufferId = static_cast(*preAssignedRnnId); + } + else + { + cacheBufferId = mRnnCacheTransBufferManager->assignBufferIndexForRecv(); + } + auto allocationResult = mRnnCacheTransBufferManager->getOrAllocateRecvBuffers( cacheBufferId, static_cast(sourceNum), bufferSizesPerSource, bufferManager); auto& recvBuffers = std::get<0>(allocationResult); @@ -310,6 +331,13 @@ void RnnCacheFormatter::unformat(TransferSession& session) auto& onlyUseDynamicBuffer = std::get<2>(allocationResult); TLLM_CHECK(cacheBufferId.has_value() || onlyUseDynamicBuffer); + + if (preAssignedRnnId.has_value()) + { + TLLM_CHECK_WITH_INFO(bufferCoverSourceNumTmp == sourceNum, "Agent needs all RNN recv buffers pre-allocated"); + TLLM_CHECK(onlyUseDynamicBuffer == false); + } + bufferCoverSourceNum = bufferCoverSourceNumTmp; remainNoCoverSourceNum = sourceNum > bufferCoverSourceNum ? sourceNum - bufferCoverSourceNum : 0; diff --git a/cpp/tensorrt_llm/batch_manager/rnnCacheTransBuffer.h b/cpp/tensorrt_llm/batch_manager/rnnCacheTransBuffer.h index e6df47bce86..f510a14f787 100644 --- a/cpp/tensorrt_llm/batch_manager/rnnCacheTransBuffer.h +++ b/cpp/tensorrt_llm/batch_manager/rnnCacheTransBuffer.h @@ -55,8 +55,10 @@ class RnnCacheTransBufferManager : public BaseTransBufferManager return mRnnStateManager; } - /// @brief set dtypes - // void setDtypes(RnnCacheState const& cacheState) noexcept; + [[nodiscard]] BufferKind getBufferKind() const override + { + return BufferKind::kRNN; + } private: /// @brief Compute transfer buffer size from RNN state configuration. 
diff --git a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp index e9ada7abb4a..d46defdf50a 100644 --- a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp +++ b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp @@ -54,8 +54,9 @@ std::string genUniqueAgentName() // layer num, since the buffer size is ratio is equal to the layer num ratio // except the VSWA case. +template auto computeSendOffsetRatio( - CacheState const& peerCacheState, int peerIdx, CacheState const& selfCacheState, int connectionIdx) + CacheStateT const& peerCacheState, int peerIdx, CacheStateT const& selfCacheState, int connectionIdx) { auto peerTargetInfo = targetIRanks(selfCacheState, peerCacheState, peerIdx); size_t offsetLayer = 0; @@ -80,22 +81,6 @@ AgentConnection::AgentConnection( TLLM_CHECK(!mCacheTransBufferManagers.empty()); } -std::optional AgentConnection::getCacheBufferId(size_t bufferIdx) const -{ - TLLM_CHECK(bufferIdx < mCacheBufferIds.size()); - return mCacheBufferIds[bufferIdx]; -} - -size_t AgentConnection::getSenderBufferCount() const -{ - return mSenderState.mCacheReceiverBufferDescs.size(); -} - -void AgentConnection::setActiveSenderBufferIdx(size_t bufferIdx) -{ - mSenderState.setActiveBufferIdx(bufferIdx); -} - MemoryDesc const& AgentConnection::SenderState::activeBufferDesc() const { TLLM_CHECK(!mCacheReceiverBufferDescs.empty()); @@ -103,7 +88,14 @@ MemoryDesc const& AgentConnection::SenderState::activeBufferDesc() const return mCacheReceiverBufferDescs[mActiveBufferIdx]; } -void AgentConnection::SenderState::setActiveBufferIdx(size_t bufferIdx) +std::pair const& AgentConnection::SenderState::activeOffsetRatio() const +{ + TLLM_CHECK(!mOffsetRatios.empty()); + TLLM_CHECK(mActiveBufferIdx < mOffsetRatios.size()); + return mOffsetRatios[mActiveBufferIdx]; +} + +void AgentConnection::SenderState::setActiveBufferIdx(size_t bufferIdx) 
const { TLLM_CHECK(bufferIdx < mCacheReceiverBufferDescs.size()); mActiveBufferIdx = bufferIdx; @@ -139,7 +131,8 @@ void AgentConnection::send(DataContext const& ctx, void const* data, size_t size reinterpret_cast(data), size, static_cast(mAgentConnectionManager->getDeviceId())}; MemoryDescs srcDescs{MemoryType::kVRAM, {srcDesc}}; auto const& dstBaseDesc = mSenderState.activeBufferDesc(); - auto offset = size / mSenderState.mOffsetRatio.second * mSenderState.mOffsetRatio.first; + auto const& offsetRatio = mSenderState.activeOffsetRatio(); + auto offset = size / offsetRatio.second * offsetRatio.first; MemoryDesc dstDesc{dstBaseDesc.getAddr() + offset, size, dstBaseDesc.getDeviceId()}; TLLM_LOG_DEBUG( "send dstDesc: %p, size: %ld ,validSegmentIdx: %ld", dstDesc.getAddr(), size, mSenderState.validSegmentIdx); @@ -169,27 +162,38 @@ void AgentConnection::sendRequestAndBufferInfo(batch_manager::RequestInfo& reque TLLM_CHECK(!common::getEnvTryZCopyForKVCacheTransfer()); TLLM_CHECK(!cacheBufferIds.empty()); - TLLM_CHECK(cacheBufferIds.size() == mCacheTransBufferManagers.size()); - auto preAllocateBuffers = std::vector(); - preAllocateBuffers.reserve(cacheBufferIds.size()); + TLLM_CHECK(cacheBufferIds.size() <= mCacheTransBufferManagers.size()); + + auto const& allKinds = mAgentConnectionManager->getBufferKinds(); + std::vector preAllocateBuffers; std::vector bufferDescs; - bufferDescs.reserve(cacheBufferIds.size()); + std::vector> activeCacheBufferIds; + std::vector activeKinds; + for (size_t i = 0; i < cacheBufferIds.size(); i++) { - TLLM_CHECK(cacheBufferIds[i].has_value()); + if (!cacheBufferIds[i].has_value()) + { + continue; + } auto preAllocateBuffer = mCacheTransBufferManagers[i]->getRecvBuffer(cacheBufferIds[i].value()); - preAllocateBuffers.push_back(preAllocateBuffer); TLLM_CHECK(preAllocateBuffer != nullptr); + preAllocateBuffers.push_back(preAllocateBuffer); + activeCacheBufferIds.push_back(cacheBufferIds[i]); + activeKinds.push_back(allKinds[i]); } - 
mCacheBufferIds = cacheBufferIds; + TLLM_CHECK(!activeCacheBufferIds.empty()); + + mCacheBufferIds = std::move(activeCacheBufferIds); + mBufferKinds = activeKinds; + int deviceId = -1; TLLM_CUDA_CHECK(cudaGetDevice(&deviceId)); TLLM_CHECK(deviceId != -1); TLLM_CHECK(deviceId == mAgentConnectionManager->getDeviceId()); - for (size_t i = 0; i < preAllocateBuffers.size(); i++) + for (auto const& buf : preAllocateBuffers) { - bufferDescs.emplace_back(reinterpret_cast(preAllocateBuffers[i]->data()), - preAllocateBuffers[i]->getSizeInBytes(), deviceId); + bufferDescs.emplace_back(reinterpret_cast(buf->data()), buf->getSizeInBytes(), deviceId); } std::string address = mAgentConnectionManager->getAgent()->getLocalConnectionInfo(); std::optional metadataOpt = std::nullopt; @@ -201,21 +205,24 @@ void AgentConnection::sendRequestAndBufferInfo(batch_manager::RequestInfo& reque } RequestAndBufferInfo requestAndBufferInfo{ - mAgentName, address, requestInfo, bufferDescs, metadataOpt, connectionIdx}; + mAgentName, address, requestInfo, bufferDescs, metadataOpt, connectionIdx, activeKinds}; std::stringstream ss; NotificationInfo notificationInfo{requestAndBufferInfo}; NotificationInfo::serialize(notificationInfo, ss); mAgentConnectionManager->getAgent()->notifySyncMessage(mRemoteAgentName, ss.str()); } -void AgentConnection::setSenderState( - std::vector cacheReceiverBufferDescs, int validSegmentIdx, std::pair offsetRatio) +void AgentConnection::setSenderState(std::vector cacheReceiverBufferDescs, int validSegmentIdx, + std::vector> offsetRatios, std::vector bufferKinds) { TLLM_CHECK(!cacheReceiverBufferDescs.empty()); + TLLM_CHECK(offsetRatios.size() == cacheReceiverBufferDescs.size()); + TLLM_CHECK(bufferKinds.size() == cacheReceiverBufferDescs.size()); mSenderState.mCacheReceiverBufferDescs = std::move(cacheReceiverBufferDescs); mSenderState.validSegmentIdx = validSegmentIdx; - mSenderState.mOffsetRatio = offsetRatio; + mSenderState.mOffsetRatios = std::move(offsetRatios); 
mSenderState.setActiveBufferIdx(0); + mBufferKinds = std::move(bufferKinds); } void AgentConnection::setHasLoadRemoteAgent(bool hasLoadRemoteAgent) @@ -244,10 +251,35 @@ bool AgentConnection::recvReadySignal(DataContext const& ctx) const return readySignalInfo.mIsReady; } +void AgentConnection::activateBuffer(uint8_t kind) const +{ + for (size_t i = 0; i < mBufferKinds.size(); i++) + { + if (mBufferKinds[i] == kind) + { + mSenderState.setActiveBufferIdx(i); + return; + } + } +} + +std::optional AgentConnection::getPreAssignedBufferId(uint8_t kind) const +{ + for (size_t i = 0; i < mBufferKinds.size(); i++) + { + if (mBufferKinds[i] == kind && i < mCacheBufferIds.size()) + { + return mCacheBufferIds[i]; + } + } + return std::nullopt; +} + AgentConnectionManager::AgentConnectionManager( - std::vector cacheTransBufferManagers, - CacheState cacheState, std::string const& backendType) + std::vector cacheTransBufferManagers, CacheState cacheState, + std::string const& backendType, std::optional rnnCacheState) : mCacheState(std::move(cacheState)) + , mRnnCacheState(std::move(rnnCacheState)) , mCacheTransBufferManagers(std::move(cacheTransBufferManagers)) , mRegMemDescs(MemoryType::kVRAM, {}) { @@ -259,10 +291,12 @@ AgentConnectionManager::AgentConnectionManager( BaseAgentConfig config{mAgentName, true, false, true}; m_Agent = makeTransferAgent(backendType, &config); TLLM_CHECK(!mCacheTransBufferManagers.empty()); + mBufferKinds.reserve(mCacheTransBufferManagers.size()); std::vector memDescs; for (auto* cacheTransBufferManager : mCacheTransBufferManagers) { TLLM_CHECK(cacheTransBufferManager != nullptr); + mBufferKinds.push_back(static_cast(cacheTransBufferManager->getBufferKind())); auto recvBufferCount = cacheTransBufferManager->getRecvBufferCount(); auto sendBufferCount = cacheTransBufferManager->getSendBufferCount(); for (size_t i = 0; i < recvBufferCount; i++) @@ -359,10 +393,53 @@ AgentConnection const* AgentConnectionManager::recvConnectionAndRequestInfo( auto 
remoteAgentName = requestAndBufferInfo.mAgentName; TLLM_LOG_DEBUG(" recv Address:%s", address.c_str()); auto connection = connect(remoteAgentName, address, metadataOpt, true); - // to compute the offset. - auto offsetRatio = computeSendOffsetRatio(requestInfo.getTransState().getCacheState().value(), - requestInfo.getTransState().getCommState()->getSelfIdx(), mCacheState, connectionIdx); - connection->setSenderState(std::move(bufferDescs), connectionIdx, offsetRatio); + auto bufferKinds = std::move(requestAndBufferInfo.mBufferKinds); + + std::optional> kvOffsetRatio; + std::optional> rnnOffsetRatio; + std::vector> offsetRatios; + offsetRatios.reserve(bufferDescs.size()); + + for (size_t bi = 0; bi < bufferDescs.size(); bi++) + { + auto kind = static_cast(bufferKinds[bi]); + switch (kind) + { + case batch_manager::BufferKind::kKV: + case batch_manager::BufferKind::kKV_INDEXER: + { + if (!kvOffsetRatio) + { + kvOffsetRatio + = computeSendOffsetRatio(requestInfo.getTransState().getCacheState().value(), + requestInfo.getTransState().getCommState()->getSelfIdx(), mCacheState, + connectionIdx); + } + offsetRatios.push_back(*kvOffsetRatio); + break; + } + case batch_manager::BufferKind::kRNN: + { + if (!rnnOffsetRatio) + { + auto rnnTargetInfo = targetIRanksForRnn(mCacheState, + requestInfo.getTransState().getCacheState().value(), + requestInfo.getTransState().getCommState()->getSelfIdx()); + size_t rnnOffsetLayer = 0; + for (int ri = 0; ri < connectionIdx; ri++) + { + rnnOffsetLayer += rnnTargetInfo.getPeerPPDomainLayerNum(ri); + } + size_t rnnSendLayer = rnnTargetInfo.getPeerPPDomainLayerNum(connectionIdx); + rnnOffsetRatio = std::make_pair(rnnOffsetLayer, rnnSendLayer); + } + offsetRatios.push_back(*rnnOffsetRatio); + break; + } + } + } + connection->setSenderState( + std::move(bufferDescs), connectionIdx, std::move(offsetRatios), std::move(bufferKinds)); notifIt = notifs.erase(notifIt); if (notifs.empty()) { @@ -421,12 +498,16 @@ BaseTransferAgent* 
AgentConnectionManager::getAgent() const return m_Agent.get(); } -std::vector const& -AgentConnectionManager::getCacheTransBufferManagers() const +std::vector const& AgentConnectionManager::getCacheTransBufferManagers() const { return mCacheTransBufferManagers; } +std::vector const& AgentConnectionManager::getBufferKinds() const +{ + return mBufferKinds; +} + AgentConnection* AgentConnectionManager::connect(std::string const& remoteAgentName, std::string const& connectionInfo, std::optional metadata, bool isSender) { diff --git a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h index a0478e3bd98..8ec948cfafe 100644 --- a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h +++ b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h @@ -17,7 +17,7 @@ #pragma once -#include "tensorrt_llm/batch_manager/cacheTransBuffer.h" +#include "tensorrt_llm/batch_manager/baseTransBuffer.h" #include "tensorrt_llm/batch_manager/dataTransceiver.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" @@ -43,6 +43,7 @@ struct RequestAndBufferInfo std::vector mBufferDescs; std::optional mMetadata; int mValidConnectionIdx; + std::vector mBufferKinds; static void serialize(RequestAndBufferInfo const& requestAndBufferInfo, std::ostream& os) { @@ -57,6 +58,11 @@ struct RequestAndBufferInfo } su::serialize(requestAndBufferInfo.mMetadata, os); su::serialize(requestAndBufferInfo.mValidConnectionIdx, os); + su::serialize(requestAndBufferInfo.mBufferKinds.size(), os); + for (auto kind : requestAndBufferInfo.mBufferKinds) + { + su::serialize(kind, os); + } } static RequestAndBufferInfo deserialize(std::istream& is) @@ -74,7 +80,15 @@ struct RequestAndBufferInfo } auto metadata = su::deserialize(is); auto validConnectionIdx = su::deserialize(is); - return RequestAndBufferInfo{agentName, address, requestInfo, bufferDescs, metadata, 
validConnectionIdx}; + auto bufferKindsSize = su::deserialize(is); + std::vector bufferKinds; + bufferKinds.reserve(bufferKindsSize); + for (size_t i = 0; i < bufferKindsSize; i++) + { + bufferKinds.push_back(su::deserialize(is)); + } + return RequestAndBufferInfo{ + agentName, address, requestInfo, bufferDescs, metadata, validConnectionIdx, bufferKinds}; } static size_t serializedSize(RequestAndBufferInfo const& requestAndBufferInfo) @@ -91,6 +105,8 @@ struct RequestAndBufferInfo } totalSize += su::serializedSize(requestAndBufferInfo.mMetadata); totalSize += su::serializedSize(requestAndBufferInfo.mValidConnectionIdx); + totalSize += su::serializedSize(requestAndBufferInfo.mBufferKinds.size()); + totalSize += requestAndBufferInfo.mBufferKinds.size() * su::serializedSize(uint8_t{}); return totalSize; } }; @@ -245,16 +261,16 @@ class AgentConnection : public Connection void recv(DataContext const& ctx, void* data, size_t size) const override; void sendRequestAndBufferInfo(batch_manager::RequestInfo& requestInfo, std::vector> const& cacheBufferIds, int validConnectionIdx); - void setSenderState( - std::vector cacheReceiverBufferDescs, int valideSegmentIdx, std::pair offsetRatio); - void setActiveSenderBufferIdx(size_t bufferIdx); - [[nodiscard]] size_t getSenderBufferCount() const; - [[nodiscard]] std::optional getCacheBufferId(size_t bufferIdx = 0) const; + void setSenderState(std::vector cacheReceiverBufferDescs, int valideSegmentIdx, + std::vector> offsetRatios, std::vector bufferKinds); void setHasLoadRemoteAgent(bool hasLoadRemoteAgent); [[nodiscard]] bool hasLoadRemoteAgent() const; void sendReadySignal(DataContext const& ctx, bool isReady) const; bool recvReadySignal(DataContext const& ctx) const; + void activateBuffer(uint8_t kind) const override; + [[nodiscard]] std::optional getPreAssignedBufferId(uint8_t kind) const override; + private: std::string mAgentName; std::string mRemoteAgentName; @@ -263,18 +279,21 @@ class AgentConnection : public Connection { 
std::vector mCacheReceiverBufferDescs; int validSegmentIdx{0}; - std::pair mOffsetRatio{0, 1}; - size_t mActiveBufferIdx{0}; + /// Per-buffer offset ratios. Index corresponds to mCacheReceiverBufferDescs / mActiveBufferIdx. + std::vector> mOffsetRatios; + mutable size_t mActiveBufferIdx{0}; [[nodiscard]] MemoryDesc const& activeBufferDesc() const; - void setActiveBufferIdx(size_t bufferIdx); + [[nodiscard]] std::pair const& activeOffsetRatio() const; + void setActiveBufferIdx(size_t bufferIdx) const; SenderState() = default; }; AgentConnectionManager* mAgentConnectionManager; - std::vector const& mCacheTransBufferManagers; + std::vector const& mCacheTransBufferManagers; std::vector> mCacheBufferIds; - SenderState mSenderState; + std::vector mBufferKinds; + mutable SenderState mSenderState; bool mNeedSendMetadata{true}; bool mHasLoadRemoteAgent{false}; }; @@ -282,17 +301,17 @@ class AgentConnection : public Connection class AgentConnectionManager : public ConnectionManager { public: - AgentConnectionManager( - std::vector cacheTransBufferManagers, - CacheState cacheState, std::string const& backendType); + AgentConnectionManager(std::vector cacheTransBufferManagers, + CacheState cacheState, std::string const& backendType, + std::optional rnnCacheState = std::nullopt); ~AgentConnectionManager(); AgentConnection* recvConnect(DataContext const& ctx, void* data, size_t size) override; [[nodiscard]] std::vector getConnections(CommState const& state) override; [[nodiscard]] CommState const& getCommState() const override; AgentConnection const* recvConnectionAndRequestInfo( batch_manager::RequestInfo& requestInfo, std::atomic const& terminateFlag); - [[nodiscard]] std::vector const& - getCacheTransBufferManagers() const; + [[nodiscard]] std::vector const& getCacheTransBufferManagers() const; + [[nodiscard]] std::vector const& getBufferKinds() const; void updateUnhandledNotifications(); [[nodiscard]] BaseTransferAgent* getAgent() const; AgentConnection* connect(std::string 
const& remoteAgentName, std::string const& address, @@ -314,7 +333,9 @@ class AgentConnectionManager : public ConnectionManager std::mutex mConnectionsMutex; CommState mCommState; CacheState mCacheState; - std::vector mCacheTransBufferManagers; + std::optional mRnnCacheState; + std::vector mCacheTransBufferManagers; + std::vector mBufferKinds; std::mutex mNotificationMutex; std::unordered_map> mUnhandledNotifications; std::unique_ptr m_Agent; diff --git a/cpp/tests/unit_tests/executor/agentCommTest.cpp b/cpp/tests/unit_tests/executor/agentCommTest.cpp index 1eebbaacc06..d72d2fac6f9 100644 --- a/cpp/tests/unit_tests/executor/agentCommTest.cpp +++ b/cpp/tests/unit_tests/executor/agentCommTest.cpp @@ -155,7 +155,7 @@ class AgentCommTest : public ::testing::TestWithParam TEST_P(AgentCommTest, AgentConnectionManagerBasic) { - std::vector bufferManagers{mTransBufferManager.get()}; + std::vector bufferManagers{mTransBufferManager.get()}; auto connectionManager = std::make_unique(bufferManagers, *mCacheState, backend); ASSERT_TRUE(connectionManager != nullptr); ASSERT_EQ(connectionManager->getCacheTransBufferManagers().size(), bufferManagers.size()); @@ -170,7 +170,7 @@ TEST_P(AgentCommTest, AgentConnectionManagerBasic) TEST_P(AgentCommTest, AgentConnectionManagerConnect) { - std::vector bufferManagers{mTransBufferManager.get()}; + std::vector bufferManagers{mTransBufferManager.get()}; auto connectionManager0 = std::make_unique(bufferManagers, *mCacheState, backend); auto connectionManager1 = std::make_unique(bufferManagers, *mCacheState, backend); auto agentName0 = connectionManager0->getAgentName(); diff --git a/cpp/tests/unit_tests/executor/serializeUtilsTest.cpp b/cpp/tests/unit_tests/executor/serializeUtilsTest.cpp index d0e1222535f..fb0fbb57f38 100644 --- a/cpp/tests/unit_tests/executor/serializeUtilsTest.cpp +++ b/cpp/tests/unit_tests/executor/serializeUtilsTest.cpp @@ -1258,12 +1258,12 @@ T serializeDeserializeNotification(T const& val) TEST(SerializeUtilsTest, 
RequestAndBufferInfo) { - // Test with all fields populated + // Test with all fields populated including bufferKinds { kv_cache::RequestAndBufferInfo original{"testAgent", "127.0.0.1:8080", tensorrt_llm::batch_manager::RequestInfo{}, std::vector{kv_cache::MemoryDesc{nullptr, 1024, 0}}, - std::make_optional("metadata"), 1}; + std::make_optional("metadata"), 1, {0, 2}}; auto deserialized = serializeDeserializeNotification(original); @@ -1276,13 +1276,14 @@ TEST(SerializeUtilsTest, RequestAndBufferInfo) EXPECT_EQ(original.mBufferDescs[0].getDeviceId(), deserialized.mBufferDescs[0].getDeviceId()); EXPECT_EQ(original.mMetadata, deserialized.mMetadata); EXPECT_EQ(original.mValidConnectionIdx, deserialized.mValidConnectionIdx); + EXPECT_EQ(original.mBufferKinds, deserialized.mBufferKinds); } - // Test with nullopt metadata + // Test with nullopt metadata and empty bufferKinds { kv_cache::RequestAndBufferInfo original{"testAgent2", "192.168.1.1:9090", tensorrt_llm::batch_manager::RequestInfo{}, - std::vector{kv_cache::MemoryDesc{nullptr, 512, 0}}, std::nullopt, 2}; + std::vector{kv_cache::MemoryDesc{nullptr, 512, 0}}, std::nullopt, 2, {}}; auto deserialized = serializeDeserializeNotification(original); @@ -1295,6 +1296,26 @@ TEST(SerializeUtilsTest, RequestAndBufferInfo) EXPECT_EQ(original.mBufferDescs[0].getDeviceId(), deserialized.mBufferDescs[0].getDeviceId()); EXPECT_EQ(original.mMetadata, deserialized.mMetadata); EXPECT_EQ(original.mValidConnectionIdx, deserialized.mValidConnectionIdx); + EXPECT_EQ(original.mBufferKinds, deserialized.mBufferKinds); + EXPECT_TRUE(deserialized.mBufferKinds.empty()); + } + + // Test with all three buffer kinds (KV + IndexerK + RNN) + { + kv_cache::RequestAndBufferInfo original{"testAgent3", "10.0.0.1:7070", + tensorrt_llm::batch_manager::RequestInfo{}, + std::vector{kv_cache::MemoryDesc{nullptr, 256, 0}, + kv_cache::MemoryDesc{nullptr, 256, 0}, kv_cache::MemoryDesc{nullptr, 128, 0}}, + std::make_optional("hybrid_metadata"), 3, {0, 1, 
2}}; + + auto deserialized = serializeDeserializeNotification(original); + + ASSERT_EQ(original.mBufferDescs.size(), deserialized.mBufferDescs.size()); + ASSERT_EQ(original.mBufferKinds.size(), deserialized.mBufferKinds.size()); + EXPECT_EQ(original.mBufferKinds, deserialized.mBufferKinds); + EXPECT_EQ(deserialized.mBufferKinds[0], 0); + EXPECT_EQ(deserialized.mBufferKinds[1], 1); + EXPECT_EQ(deserialized.mBufferKinds[2], 2); } } @@ -1374,7 +1395,7 @@ TEST(SerializeUtilsTest, NotificationInfo) kv_cache::RequestAndBufferInfo requestInfo{"testAgent", "127.0.0.1:8080", tensorrt_llm::batch_manager::RequestInfo{}, std::vector{kv_cache::MemoryDesc{nullptr, 1024, 0}}, - std::make_optional("test_metadata"), 1}; + std::make_optional("test_metadata"), 1, {0, 2}}; kv_cache::NotificationInfo original{requestInfo}; auto deserialized = serializeDeserializeNotification(original); @@ -1386,6 +1407,7 @@ TEST(SerializeUtilsTest, NotificationInfo) EXPECT_EQ(requestInfo.mRequestInfo.getRequestId(), deserializedRequestInfo.mRequestInfo.getRequestId()); EXPECT_EQ(requestInfo.mMetadata, deserializedRequestInfo.mMetadata); EXPECT_EQ(requestInfo.mValidConnectionIdx, deserializedRequestInfo.mValidConnectionIdx); + EXPECT_EQ(requestInfo.mBufferKinds, deserializedRequestInfo.mBufferKinds); } // Test with NotificationSyncInfo variant @@ -1416,6 +1438,19 @@ TEST(SerializeUtilsTest, NotificationInfo) } } +TEST(SerializeUtilsTest, BufferKindEnumValues) +{ + using tensorrt_llm::batch_manager::BufferKind; + + EXPECT_EQ(static_cast(BufferKind::kKV), 0); + EXPECT_EQ(static_cast(BufferKind::kKV_INDEXER), 1); + EXPECT_EQ(static_cast(BufferKind::kRNN), 2); + + EXPECT_EQ(static_cast(uint8_t{0}), BufferKind::kKV); + EXPECT_EQ(static_cast(uint8_t{1}), BufferKind::kKV_INDEXER); + EXPECT_EQ(static_cast(uint8_t{2}), BufferKind::kRNN); +} + TEST(SerializeUtilsTest, CacheStateIndexerKCache) { using texec::kv_cache::CacheState; diff --git a/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp 
b/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp index eefeb5ed7d3..9acc8236dd0 100644 --- a/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp +++ b/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp @@ -767,13 +767,17 @@ class AsymmetricalCacheTest : public ::testing::TestWithParam(bufferManagers, *mCacheState, "nixl"); + std::vector baseBufferManagers( + bufferManagers.begin(), bufferManagers.end()); + mConnectionManager = std::make_unique( + baseBufferManagers, *mCacheState, "nixl"); } else if (isMooncake) { + std::vector baseBufferManagers( + bufferManagers.begin(), bufferManagers.end()); mConnectionManager = std::make_unique( - bufferManagers, *mCacheState, "mooncake"); + baseBufferManagers, *mCacheState, "mooncake"); } else { diff --git a/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py b/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py index 8023a634267..065bc20b57e 100644 --- a/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py +++ b/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py @@ -44,10 +44,7 @@ def create_kv_cache_transceiver( if cache_transceiver_config.backend == "DEFAULT": # When cache_transceiver_config.backend is not set, fallback to env_vars settings # NIXL is the default backend for non hybrid models - if mamba_cache_manager is None: - cache_transceiver_config.backend = "NIXL" - else: - cache_transceiver_config.backend = "UCX" + cache_transceiver_config.backend = "NIXL" # Ordered by priority env_vars = [ ("TRTLLM_USE_NIXL_KVCACHE", "NIXL"), @@ -72,14 +69,6 @@ def create_kv_cache_transceiver( f"UCX_CUDA_IPC_ENABLE_MNNVL=n, UCX_RNDV_SCHEME=put_zcopy and/or unset UCX_NET_DEVICES upon server " f"hangs or lower-than-expected performance.") - if mamba_cache_manager is not None and cache_transceiver_config.backend in [ - "NIXL", "MOONCAKE" - ]: - raise ValueError( - "NIXL or MOONCAKE backend does not support hybrid models with RNN (Mamba) states. 
" - "Please use UCX or MPI backend for cache transfer with hybrid models." - ) - # Select transceiver implementation based on transceiver_runtime # transceiver_runtime == None or "CPP" -> use C++ transceiver (default) # transceiver_runtime == "PYTHON" -> use Python transceiver diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index c2cfe9c68f3..d35191aaab2 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -63,6 +63,7 @@ from .guided_decoder import CapturableGuidedDecoder from .layerwise_nvtx_marker import LayerwiseNvtxMarker from .llm_request import LlmRequest, get_draft_token_length +from .mamba_cache_manager import MambaHybridCacheManager from .model_loader import ModelLoader, _construct_checkpoint_loader from .resource_manager import (BaseResourceManager, KVCacheManager, KVCacheManagerV2, PeftCacheManager, @@ -677,7 +678,8 @@ def warmup(self, resource_manager: ResourceManager) -> None: self._run_autotuner_warmup(resource_manager) self._run_cuda_graph_warmup(resource_manager) if not self.is_draft_model and not self.mapping.has_cp_helix( - ) and self.guided_decoder is None: + ) and self.guided_decoder is None and not isinstance( + kv_cache_manager, MambaHybridCacheManager): # Run extra general warmup to warmup memory pool before running real requests to reduce memory fragmentation. 
self._general_warmup(resource_manager, reverse=True) diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index 75ff4cbf89c..2e3ad4f1bb3 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -1694,13 +1694,12 @@ class TestNemotron3Super120B(LlmapiAccuracyTestHarness): MODEL_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-012726" MODEL_PATH = f"{llm_models_root()}/NVIDIA-Nemotron-3-Super-120B-FP8-FP8KV-012726" - @pytest.mark.skip_less_device(8) - def test_auto_dtype(self): + def _make_configs(self, backend: str): ctx_server_config = { "max_batch_size": 32, "disable_overlap_scheduler": True, "cache_transceiver_config": { - "backend": "UCX", + "backend": backend, "max_tokens_in_buffer": 8192, }, "tensor_parallel_size": 4, @@ -1719,7 +1718,7 @@ def test_auto_dtype(self): "max_batch_size": 32, "disable_overlap_scheduler": False, "cache_transceiver_config": { - "backend": "UCX", + "backend": backend, "max_tokens_in_buffer": 8192, }, "tensor_parallel_size": 2, @@ -1752,7 +1751,18 @@ def test_auto_dtype(self): "urls": ["localhost:8002"] } } - with launch_disaggregated_llm(disaggregated_server_config, - ctx_server_config, gen_server_config, + return ctx_server_config, gen_server_config, disaggregated_server_config + + @pytest.mark.skip_less_device(8) + def test_auto_dtype(self): + ctx_cfg, gen_cfg, disagg_cfg = self._make_configs("UCX") + with launch_disaggregated_llm(disagg_cfg, ctx_cfg, gen_cfg, + self.MODEL_PATH) as llm: + run_accuracy_test(llm, self.MODEL_NAME, ["GSM8K"]) + + @pytest.mark.skip_less_device(8) + def test_nixl_backend(self): + ctx_cfg, gen_cfg, disagg_cfg = self._make_configs("NIXL") + with launch_disaggregated_llm(disagg_cfg, ctx_cfg, gen_cfg, self.MODEL_PATH) as llm: run_accuracy_test(llm, self.MODEL_NAME, ["GSM8K"]) diff --git a/tests/integration/test_lists/qa/llm_function_core.txt 
b/tests/integration/test_lists/qa/llm_function_core.txt index 72762da3ad6..8ae1369bfba 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ b/tests/integration/test_lists/qa/llm_function_core.txt @@ -403,6 +403,8 @@ accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True-True] accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False-False] accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend accuracy/test_disaggregated_serving.py::TestKimiK2::test_nvfp4 +accuracy/test_disaggregated_serving.py::TestNemotron3Super120B::test_auto_dtype +accuracy/test_disaggregated_serving.py::TestNemotron3Super120B::test_nixl_backend # e2e test test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-] diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index 094408ae1b4..1549cf1a4f0 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -135,6 +135,8 @@ l0_dgx_b200: - accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_8gpus[attention_dp_on-trtllm] TIMEOUT (60) - accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_8gpus[attention_dp_on-cutlass] TIMEOUT (60) - accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_parallelism[TP4_PP2] TIMEOUT (60) + - accuracy/test_disaggregated_serving.py::TestNemotron3Super120B::test_auto_dtype TIMEOUT (60) + - accuracy/test_disaggregated_serving.py::TestNemotron3Super120B::test_nixl_backend TIMEOUT (60) - condition: ranges: system_gpu_count: diff --git a/tests/unittest/others/test_kv_cache_transceiver.py b/tests/unittest/others/test_kv_cache_transceiver.py index 87e07edcfb2..6960b1e28d0 100644 --- a/tests/unittest/others/test_kv_cache_transceiver.py +++ b/tests/unittest/others/test_kv_cache_transceiver.py @@ -334,7 +334,7 @@ def hybrid_dtypes(request): @pytest.mark.timeout(120) 
-@pytest.mark.parametrize("backend", ["UCX"], ids=["UCX"]) +@pytest.mark.parametrize("backend", ["NIXL", "UCX"], ids=["NIXL", "UCX"]) @pytest.mark.parametrize( "hybrid_dtypes", [ @@ -450,7 +450,7 @@ def test_hybrid_cache_transceiver_single_process(backend, hybrid_dtypes, @pytest.mark.timeout(120) -@pytest.mark.parametrize("backend", ["UCX"], ids=["UCX"]) +@pytest.mark.parametrize("backend", ["NIXL", "UCX"], ids=["NIXL", "UCX"]) def test_hybrid_cache_transceiver_cancel_request(backend, monkeypatch): monkeypatch.setenv("TRTLLM_USE_CPP_MAMBA", "1") @@ -460,7 +460,7 @@ def test_hybrid_cache_transceiver_cancel_request(backend, monkeypatch): hybrid_cache_manager_ctx = create_hybrid_cache_manager(mapping, dtype) hybrid_cache_manager_gen = create_hybrid_cache_manager(mapping, dtype) - cache_transceiver_config = CacheTransceiverConfig(backend="DEFAULT", + cache_transceiver_config = CacheTransceiverConfig(backend=backend, max_tokens_in_buffer=512) dist = Distributed.get(mapping) From 7747f255b826a0905700190f83e024e8422459d7 Mon Sep 17 00:00:00 2001 From: tcherckez-nvidia <127761168+tcherckez-nvidia@users.noreply.github.com> Date: Mon, 9 Mar 2026 19:58:21 +0200 Subject: [PATCH 7/9] [None][feat] Add Auto-Deploy dashboard failures analysis skill (#12033) Signed-off-by: Tal Cherckez <127761168+tcherckez-nvidia@users.noreply.github.com> --- .../skills/ad-pipeline-failure-pr/SKILL.md | 317 ++++++++++++++++++ 1 file changed, 317 insertions(+) create mode 100644 .claude/skills/ad-pipeline-failure-pr/SKILL.md diff --git a/.claude/skills/ad-pipeline-failure-pr/SKILL.md b/.claude/skills/ad-pipeline-failure-pr/SKILL.md new file mode 100644 index 00000000000..093eb1c30ce --- /dev/null +++ b/.claude/skills/ad-pipeline-failure-pr/SKILL.md @@ -0,0 +1,317 @@ +--- +name: ad-pipeline-failure-pr +description: Analyze the latest AutoDeploy pipeline or a user-specified pipeline ID, inspect failed job logs, group similar failures into actionable root-cause buckets, and create at most one PR per 
bucket. Use when the user mentions pipeline IDs, failed jobs, GitLab logs, failure buckets, or opening PRs from CI failures. +--- + +# Pipeline Failure PR + +**Input:** latest AutoDeploy `model-coverage` GitLab pipeline, or a specific upstream/downstream pipeline ID / pipeline URL. **Auth requirement:** the user must export a GitLab token in `GITLAB_TOKEN` before this skill can query pipelines, jobs, or traces. **Output:** first ask the user which output format is preferred. Default to reporting in chat. Alternative outputs are a Markdown report (`md`) and a per-failure CSV (`csv`). The skill still produces a bucketed failure report plus at most one PR per actionable root-cause bucket, and when a PR is not justified but the bucket is still worth tracking, create one issue for that bucket. + +## Core Rule + +This skill must be standalone. Resolve pipelines, failed jobs, and raw logs directly from GitLab APIs and job traces. Do **not** depend on `autodeploy-dashboard` code, scripts, CSVs, or its legacy categorization logic. This skill owns the bucketing rules, skip rules, repo ownership decision, and one-PR-per-bucket behavior. + +Before any GitLab API call, require `GITLAB_TOKEN` to be set in the environment. If it is missing, stop immediately and tell the user: `Set GITLAB_TOKEN to a GitLab personal access token and rerun this skill.` + +Before doing the main analysis, ask the user which output is preferred: +- `chat` (default) +- `md` +- `csv` + +If the user does not specify, default to `chat`. + +## Phase 0 — Resolve Scope + +1. Default scope is `model-coverage`. Do not silently switch to benchmark pipelines. +2. If the user explicitly asks to analyze a benchmark pipeline, stop and tell them this skill does not support benchmark pipelines. +3. If the user gives a pipeline ID or GitLab pipeline URL, use it. +4. 
Treat a user-provided pipeline as potentially either: + - an upstream AutoDeploy pipeline in `ftp/infra/autodeploy-dashboard` + - a downstream triggered pipeline in `dl/jet/ci` +5. If the starting pipeline is upstream, follow the failed bridge chain until you reach the first downstream pipeline with terminal `model-coverage` jobs. +6. Otherwise resolve the latest upstream AutoDeploy pipeline that ran `model-coverage`, then follow the same bridge chain to the terminal pipeline. +7. If `GITLAB_TOKEN` is missing, stop immediately and tell the user exactly how to fix it: `Set GITLAB_TOKEN to a GitLab personal access token and rerun this skill.` + +## Pipeline Resolution Rules + +Use this resolution order: +1. Identify whether the provided pipeline belongs to the upstream dashboard project or the downstream `dl/jet/ci` project. +2. If it is upstream, inspect its bridge jobs and select the failed `model-coverage` trigger path. +3. If the next pipeline contains only bridge jobs, keep following the failed trigger chain. +4. Stop at the first downstream pipeline that contains terminal failed `model-coverage` jobs with traces. +5. Report both: + - the user-facing starting pipeline + - the terminal pipeline that contains the actual failing jobs + +Do not analyze only the bridge failure if a deeper downstream pipeline contains the real job traces. + +All GitLab API and trace-fetching steps in this skill must authenticate with the token from `GITLAB_TOKEN`. + +## Phase 1 — Gather Failure Evidence + +For each failed job, collect: +- pipeline ID +- job ID and job URL +- raw log URL +- workload name +- model or benchmark configuration +- first causal error snippet from the raw trace + +Also collect: +- starting pipeline ID +- terminal pipeline ID +- whether the job came from a bridge-followed downstream path + +Before proposing a fix, read at least one representative raw log for every tentative bucket. Do not rely on legacy labels alone. 
+ +Trace-reading rules: +- In `model-coverage` terminal pipelines, jobs often come in triplets like `[1 logs_before]`, `[2 <workload>]`, `[3 logs_after]`. The primary failing workload is usually the `[2 ...]` job. Use `[1]` and `[3]` only as supplemental evidence when needed. +- If the trace ends with generic wrapper failures such as `RuntimeError: Executor worker returned error`, `RuntimeError: Executor worker died during initialization`, or `ERROR: Job failed: Process exited with status 1`, keep scanning upward and record the earlier model-, export-, tokenizer-, or environment-specific exception instead. +- Prefer the first specific exception that explains the failure over later fallout from worker teardown, Slurm cleanup, or proxy startup. +- When the workload dumps its config in the trace, capture the resolved `model:` value and relevant `yaml_extra`/runtime hints. They are often useful for explaining why a bucket is multimodal, world-size-specific, or using a special mode. + +## Skill-Owned Bucket Rules + +Every analyzed failed job must end up in exactly one bucket. Do **not** leave failures in an implicit catch-all like `other`, `misc`, or `untriaged` in the final report. + +This includes infra and external cases. They still need explicit buckets, for example: +- `infra/resource/oom` +- `infra/runtime/timeout-or-freeze` +- `infra/runtime/cancelled` +- `infra/filesystem/hf-lock-permission` +- `external/huggingface/access-forbidden` +- `external/huggingface/missing-revision` +- `external/huggingface/invalid-tokenizer-or-processor` +- `external/env/missing-python-package` +- `external/transformers/api-mismatch` + +Do **not** assume `oom` or `timeout-or-freeze` are infra-only. In AutoDeploy pipelines they often reflect real `TensorRT-LLM` / AutoDeploy bugs. Classify them as `infra/...` only when the evidence points to cluster noise or a non-code resource problem. Otherwise bucket them under the real owning repo/component. 
+ +Group failures together only when all of these are true: +- they point to the same likely code owner and target repo +- they share the same causal failure signature, such as the same failing symbol, op, assertion, stack frame, or config path +- they appear fixable by one coherent code change +- one PR can reasonably explain why the same fix covers every matched job + +Split failures into different buckets when any of these are true: +- the first causal error differs even if the legacy category matches +- the same symptom comes from different repos or subsystems +- one failure is infrastructure noise and the other is a code bug +- the likely fixes would touch unrelated files or require different validation +- the evidence is mixed or contradictory + +When uncertain, split instead of merge. + +If a failed job does not fit any existing bucket, put it in its own one-job bucket. +Do not leave it uncategorized. + +That one-job bucket must still be labeled as exactly one of: +- `actionable` — likely fixable with a PR +- `issue-only` — worth tracking, but not ready for a PR + +Do not use a `skip PR` label. If a bucket should not produce a PR, mark it `issue-only` when it is still worth tracking. + +Buckets such as OOM, timeout/freeze, cancelled, or Hugging Face access failures must still appear explicitly in the report. If the shared failure mode is clear enough to track, prefer `issue-only`. + +The final report must account for **all** failed jobs: +- include the total failed job count +- include bucket counts +- ensure the sum of all bucket sizes equals the total failed job count +- make unmatched or low-confidence cases explicit as singleton buckets instead of hiding them + +Use this evidence priority order when bucketing: +1. first causal stack frame or assertion +2. explicit failing symbol, op, layer, config key, or script +3. repeated error snippet near the first failure +4. repeated failure wording across matched traces +5. 
job naming and workload metadata only as a weak tie-breaker + +Each bucket must have: +- a short bucket name in the form `repo/component/failure-mode` +- one representative job +- a list of all matching jobs +- one root-cause hypothesis tied to code + +## Skip Rules + +Do **not** create a PR for a bucket when any of these are true: +- the failures are pure infrastructure noise such as timeout, preemption, cluster cancellation, or log-access failure without code evidence +- the jobs do not share one plausible code fix +- the evidence is too weak to point at a concrete code path +- the issue belongs to external infrastructure or an external dependency outside the checked-out repos +- an open PR already appears to address the same bucket +- the only commonality is a broad status label or superficial wording + +If the starting pipeline failed only because a bridge failed, do not treat the bridge as its own actionable bucket unless the downstream terminal pipeline has no failing jobs or no accessible traces. + +Infrastructure and external buckets must still be reported as explicit buckets. They should usually be `issue-only` rather than promoted to a PR unless the evidence clearly points to a repo-owned fix. 
+ +Common `issue-only` patterns seen in AutoDeploy model-coverage pipelines: +- gated or forbidden Hugging Face repos (`403`) +- missing or renamed Hugging Face revisions/models (`404`) +- missing optional Python packages such as `timm`, `num2words`, `mamba_ssm`, `causal_conv1d`, or similar runtime dependencies +- filesystem permission problems on Hugging Face cache lock files +- only clearly non-code resource failures after log review; do not auto-classify CUDA OOM or timeout/freeze as infra without checking for an AutoDeploy root cause + +## Repo Ownership Rules + +Prefer `TensorRT-LLM` when the root cause is in: +- AutoDeploy model code +- AutoDeploy runtime or transforms +- tests, configs, or execution paths owned by `TensorRT-LLM` +- code paths surfaced by `ad-debug-agent` + +Prefer `autodeploy-dashboard` when the root cause is in: +- failure-analysis scripts +- workload generation +- job URL or raw-log resolution +- pipeline orchestration or reporting gaps in the AutoDeploy pipeline repo + +Do not open a PR when the bucket belongs to cluster infrastructure, GitLab service behavior, or another external system that is not owned by the checked-out repos. + +## Phase 2 — Validate Each Bucket + +For every bucket: +1. Read the representative job log and isolate the first causal failure, not the downstream fallout. +2. Read the relevant code, config, or script that the failure points to. +3. Confirm that the same hypothesis explains the other jobs in the bucket. +4. If deeper AutoDeploy tracing is needed, use the `ad-debug-agent` workflow to inspect the failing code path before editing. +5. If the representative log does not actually support the bucket hypothesis, split or discard the bucket. + +Do not start coding until the bucket has both: +- one representative log snippet +- one code-level hypothesis + +## Phase 3 — Create At Most One Fix Per Bucket + +Work one bucket at a time. + +For an actionable bucket: +1. 
Choose the smallest code change that plausibly fixes the shared root cause. +2. Prefer a targeted fix over a broad cleanup. +3. Verify with the smallest relevant test or validation step. +4. If the validation suggests the bucket actually contains multiple root causes, split it before opening any PRs. +5. Create one branch and one PR for the full bucket. + +Never open one PR per failed job when the jobs share the same fix. + +## Phase 3b — Create One Issue When No PR Is Available + +If a bucket is worth tracking, but you do **not** have enough confidence for a PR, create one issue for that bucket instead of silently stopping. + +Create an issue when all of these are true: +- the bucket has a clear shared failure mode +- the representative logs provide enough evidence to explain the bucket +- one issue can clearly describe the shared failure mode +- a PR is not justified yet because the fix is uncertain, risky, mixed, under-validated, external, or infra-related + +Do **not** create an issue when any of these are true: +- the evidence is too weak to explain the failure mode at all +- an open issue or PR already appears to cover the same bucket +- the bucket is just a duplicate restatement of another bucket + +Issues for infra or external buckets are valid. Examples include: +- `infra/resource/oom` +- `infra/runtime/timeout-or-freeze` +- `infra/runtime/cancelled` +- `external/huggingface/access-forbidden` +- `external/huggingface/missing-revision` +- `external/env/missing-python-package` + +For `oom` and `timeout-or-freeze`, prefer a repo-owned bucket instead when the traces suggest a reproducible AutoDeploy issue rather than infrastructure noise. + +When creating an issue in `TensorRT-LLM`, use the repository templates in `.github/ISSUE_TEMPLATE/` instead of inventing a custom issue body. +- For failure buckets from this skill, use `.github/ISSUE_TEMPLATE/06-bug-report.yml` by default. 
+- Only use another template if the bucket is clearly a feature request or another non-bug category. + +Fill the selected issue template with the triage evidence from this skill. At minimum, include: +- pipeline ID and workload scope +- representative job URL +- first causal failure snippet +- matching jobs or affected model families +- likely owner or subsystem when known +- code-level hypothesis when applicable +- why a PR was not created yet + +Respect the template's required structure and security guidance. Do not paste sensitive tokens, private credentials, or other secrets into the issue body. + +Prefer one issue per bucket, not one issue per job. + +## PR Guardrails + +Before opening a PR: +- verify there is no existing open PR for the same bucket or failure signature +- confirm the PR target repo matches the bucket owner +- ensure the proposed fix is backed by evidence from logs and code +- make sure the PR description explains why one change covers all jobs in the bucket + +For `TensorRT-LLM` PRs, follow the repo workflow: +- use the local PR title format: `[JIRA/NVBUG/None][type] description` +- keep the PR focused on one concern +- validate only the smallest relevant tests or commands + +## Issue Guardrails + +Before opening an issue: +- verify there is no existing open issue or PR for the same bucket or failure signature +- confirm the issue target repo is the best available home for the bucket +- make sure the issue explains why no PR was created +- include enough evidence that another engineer can pick it up without redoing the initial triage +- use the appropriate file from `.github/ISSUE_TEMPLATE/`, usually `06-bug-report.yml` for failure buckets from this skill + +## PR Body Template + +Use this structure: + +```markdown +## Summary +- Fixes root-cause bucket: `<repo/component/failure-mode>` +- Resolves failures from pipeline `<pipeline-id>` +- One change covers `<N>` matching jobs because `<shared-root-cause>` + +## Evidence +- Representative job: `<job-url>` +- Representative log snippet: `<first-causal-error-snippet>` +- Matching jobs: `<job-count>` 
across `<workloads-or-models>` +- Bucket rule: `<why-these-jobs-share-one-bucket>` + +## Validation +- `<test-or-command-run>` + +## Not Included +- `<out-of-scope-items>` +``` + +## Phase 4 — Final Report + +Print a concise final report with: +1. target pipeline, terminal pipeline, and workload scope +2. all buckets with status such as `actionable` or `issue-only` +3. representative evidence for each actionable bucket +4. PRs created, issues created, or why no PR was created for an `issue-only` bucket +5. remaining risks or follow-up validation + +The final report must also include a bucketization checksum: +- `total failed jobs = <N>` +- `sum of bucket sizes = <N>` + +If no PRs or issues were created, say that explicitly and explain whether the blocker was: +- duplicate-checks not yet performed +- evidence too weak for a concrete code owner +- no coherent single fix +- external or infra ownership + +Honor the user's selected output format: +- `chat`: print the final report directly in chat +- `md`: also write the final report to a Markdown file +- `csv`: also write a per-failure CSV with one row per failed job, including at least job ID, job URL, workload/model, first causal error, bucket, likely owner, and outcome + +## Anti-Patterns + +- Do not trust a legacy category without reading logs. +- Do not depend on `autodeploy-dashboard` code to resolve pipelines or classify failures. +- Do not stop at the first failed bridge if the real `model-coverage` failures are deeper in the downstream trigger chain. +- Do not merge failures just because they mention the same model. +- Do not create a PR for a bucket that maps to multiple unrelated fixes. +- Do not open PRs for infra-only buckets. +- Do not hide uncertainty; if evidence is mixed, split or skip. 
From 2fe7b1474e4416eb849d2439bfa8900c43d56797 Mon Sep 17 00:00:00 2001 From: Pamela Peng <179191831+pamelap-nvidia@users.noreply.github.com> Date: Mon, 9 Mar 2026 14:26:58 -0400 Subject: [PATCH 8/9] [https://nvbugs/5820511][fix] Upgrade Cutlass version (#11956) Signed-off-by: Pamela <179191831+pamelap-nvidia@users.noreply.github.com> --- 3rdparty/fetch_content.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/fetch_content.json b/3rdparty/fetch_content.json index c3d0e1dbcce..9b1944abcdd 100644 --- a/3rdparty/fetch_content.json +++ b/3rdparty/fetch_content.json @@ -11,7 +11,7 @@ { "name": "cutlass", "git_repository": "https://github.com/NVIDIA/cutlass", - "git_tag": "v4.3.0", + "git_tag": "v4.4.1", "git_shallow": true, "source_subdir": "dont-add-this-project-with-add-subdirectory" }, From 35ccedde586621d7d67c367bdd19f8868a567b68 Mon Sep 17 00:00:00 2001 From: tcherckez-nvidia <127761168+tcherckez-nvidia@users.noreply.github.com> Date: Mon, 9 Mar 2026 21:20:38 +0200 Subject: [PATCH 9/9] =?UTF-8?q?[None][feat]=20Add=20AD=20model=20list=20va?= =?UTF-8?q?lidation=20checks=20to=20pre-commit=20and=20PR=E2=80=A6=20(#120?= =?UTF-8?q?36)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Tal Cherckez <127761168+tcherckez-nvidia@users.noreply.github.com> --- .github/workflows/model-registry-check.yml | 40 +++++++ .pre-commit-config.yaml | 8 ++ scripts/check_model_registry.py | 131 +++++++++++++++++++++ 3 files changed, 179 insertions(+) create mode 100644 .github/workflows/model-registry-check.yml create mode 100644 scripts/check_model_registry.py diff --git a/.github/workflows/model-registry-check.yml b/.github/workflows/model-registry-check.yml new file mode 100644 index 00000000000..122b9d92470 --- /dev/null +++ b/.github/workflows/model-registry-check.yml @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Model Registry Check + +on: + pull_request: + types: [opened, edited, synchronize, reopened] + paths: + - examples/auto_deploy/model_registry/models.yaml + +jobs: + validate-model-registry: + name: Validate AutoDeploy Model Registry + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - uses: actions/setup-python@v6 + with: + python-version: "3.12" + cache: "pip" + + - name: Install validator dependency + run: python3 -m pip install PyYAML + + - name: Validate model registry + run: python3 scripts/check_model_registry.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fb1c925d5ba..687ae90b889 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1459,6 +1459,14 @@ repos: files: ".*/auto_deploy/.*" - repo: local hooks: + - id: model-registry-check + name: Validate AutoDeploy model registry + entry: python scripts/check_model_registry.py + language: python + additional_dependencies: + - PyYAML + files: ^examples/auto_deploy/model_registry/models\.yaml$ + pass_filenames: false - id: test lists format name: Check for tabs and multiple spaces in test_lists txt files entry: ./scripts/format_test_list.py diff --git a/scripts/check_model_registry.py b/scripts/check_model_registry.py new file mode 100644 index 00000000000..c14edd21e9b --- /dev/null +++ b/scripts/check_model_registry.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +# 
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Validate the AutoDeploy model registry YAML file.

The registry must be a top-level mapping whose ``models`` value is a list of
mappings. Each entry must carry exactly the keys ``name`` (a non-empty string)
and ``yaml_extra`` (a list of non-empty strings), and no two entries may share
the same ``name``. The script exits with status 0 when the file is valid and
status 1 otherwise, printing every problem to stderr.
"""

from __future__ import annotations

import argparse
import collections
import pathlib
import sys
import typing

import yaml

MODEL_REGISTRY_PATH = pathlib.Path("examples/auto_deploy/model_registry/models.yaml")
EXPECTED_MODEL_KEYS = {"name", "yaml_extra"}


def parse_args() -> argparse.Namespace:
    """Parse command-line arguments.

    Returns:
        A namespace with a single ``path`` attribute pointing at the registry
        file; it defaults to ``MODEL_REGISTRY_PATH``.
    """
    parser = argparse.ArgumentParser(
        description="Validate the AutoDeploy model registry for duplicates and entry structure."
    )
    parser.add_argument(
        "--path",
        type=pathlib.Path,
        default=MODEL_REGISTRY_PATH,
        help="Path to the model registry YAML file.",
    )
    return parser.parse_args()


def load_registry(path: pathlib.Path) -> dict[str, typing.Any]:
    """Load the registry YAML file and return its top-level mapping.

    Args:
        path: Location of the registry YAML file.

    Returns:
        The parsed top-level mapping.

    Raises:
        ValueError: If the file is missing, cannot be read, is not valid
            YAML, or its top-level value is not a mapping. Every failure is
            reported as ``ValueError`` so that ``main`` can print one clean
            message instead of a traceback.
    """
    try:
        with path.open(encoding="utf-8") as file:
            loaded = yaml.safe_load(file)
    except FileNotFoundError as error:
        raise ValueError(f"Registry file does not exist: {path}") from error
    except OSError as error:
        # Covers the remaining filesystem failures (path is a directory,
        # permission denied, ...) that would otherwise escape as a traceback.
        raise ValueError(f"Failed to read registry file {path}: {error}") from error
    except yaml.YAMLError as error:
        raise ValueError(f"Failed to parse YAML in {path}: {error}") from error

    if not isinstance(loaded, dict):
        raise ValueError(f"Expected top-level mapping in {path}, got {type(loaded).__name__}.")

    return loaded


def validate_models(models: typing.Any) -> list[str]:
    """Check the structure of every registry entry and detect duplicate names.

    Args:
        models: The value stored under the registry's ``models`` key. Any
            type is accepted; a non-list yields a single error.

    Returns:
        Human-readable error messages, empty when the registry is valid.
        Entries are referred to by their 1-based position in the list.
    """
    if not isinstance(models, list):
        return [f"Expected 'models' to be a list, got {type(models).__name__}."]

    errors: list[str] = []
    seen_names: dict[str, list[int]] = collections.defaultdict(list)

    for index, model_entry in enumerate(models, start=1):
        entry_label = f"models[{index}]"
        if not isinstance(model_entry, dict):
            errors.append(
                f"{entry_label}: expected a mapping entry, got {type(model_entry).__name__}."
            )
            continue

        entry_keys = set(model_entry)
        missing_keys = sorted(EXPECTED_MODEL_KEYS - entry_keys)
        # YAML mapping keys are not necessarily strings (e.g. ints, None), and
        # sorting mixed types raises TypeError. Sorting by str() keeps the
        # validator reporting instead of crashing, and leaves the order
        # unchanged when all keys are strings.
        unexpected_keys = sorted(entry_keys - EXPECTED_MODEL_KEYS, key=str)
        if missing_keys or unexpected_keys:
            details: list[str] = []
            if missing_keys:
                details.append(f"missing keys {missing_keys}")
            if unexpected_keys:
                details.append(f"unexpected keys {unexpected_keys}")
            joined_details = ", ".join(details)
            errors.append(
                f"{entry_label}: expected exactly the keys ['name', 'yaml_extra']; {joined_details}."
            )

        name = model_entry.get("name")
        has_valid_name = isinstance(name, str) and bool(name.strip())
        if not has_valid_name:
            errors.append(f"{entry_label}: missing non-empty string 'name'.")

        yaml_extra = model_entry.get("yaml_extra")
        if not isinstance(yaml_extra, list) or not all(
            isinstance(item, str) and item.strip() for item in yaml_extra
        ):
            errors.append(f"{entry_label}: 'yaml_extra' must be a list of non-empty strings.")

        # Only well-formed names take part in duplicate detection.
        if has_valid_name:
            seen_names[name].append(index)

    for name, indices in sorted(seen_names.items()):
        if len(indices) > 1:
            joined_indices = ", ".join(str(index) for index in indices)
            errors.append(f"Duplicate model name {name!r} found at entries: {joined_indices}.")

    return errors


def main() -> int:
    """Run the validation and report the outcome.

    Returns:
        ``0`` when the registry is valid, ``1`` when loading or validation
        fails. Failures are printed to stderr, success to stdout.
    """
    args = parse_args()

    try:
        registry = load_registry(args.path)
    except ValueError as error:
        print(f"Model registry validation failed: {error}", file=sys.stderr)
        return 1

    errors = validate_models(registry.get("models"))
    if errors:
        print(f"Model registry validation failed for {args.path}:", file=sys.stderr)
        for error in errors:
            print(f"  - {error}", file=sys.stderr)
        return 1

    print(f"Model registry validation passed for {args.path}.")
    return 0


if __name__ == "__main__":
    sys.exit(main())