diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h index 24d34154017..21c1b80d985 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h @@ -42,7 +42,8 @@ enum class ActType // // GatedSilu is a special case of SwiGlu where the alpha is 1.0 and the beta is 0.0. SwiGlu, - Relu2 + Relu2, + Silu }; // Type of the element-wise activation to apply after the Gemm @@ -59,6 +60,10 @@ enum class EltwiseActType // act = relu(x0) ^ 2 // where x0 is the output of the Gemm. Relu2, + // Silu is defined as the following operation: + // act = x0 * sigmoid(x0) + // where x0 is the output of the Gemm. + Silu }; struct TrtllmGenBatchedGemmRunnerOptions diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h index a84b863cdc0..0f14135427f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h @@ -141,10 +141,10 @@ struct BatchedGemmData // The rightmost dimension is contiguous in memory. // // If DeepSeek FP8 recipe is not used, but for MxFp{4,8}, MxInt4 and NvFp4 formats: - // The layout of scaling factors for A is always R128c4 + // If the layout is R128c4, // M must be a multiple of 128. - // K must be a multiple of 64. - // The "logical" shape is: [paddedM, K / P], where P is the scaling block size. + // K must be a multiple of 4 * P, where P is the scaling block size. + // The "logical" shape is: [paddedM, K / P]. // The R128c4 layout is: [paddedM / 128, K / P / 4, 512]. // The shape we use for TMA is: [paddedM / 128, K / P / 4, 2, 256]. // Where paddedM is M if (routeAct == true && batchM), or @@ -302,7 +302,7 @@ struct BatchedGemmData // The pre-activation scaling factor (typically dequantA * dequantB) for non-gated non-linear // activation. - // Only used when non-linear activation is applied (e.g., GELU, Relu2). + // Only used when non-linear activation is applied (e.g., GELU, Relu2, Silu). // When used, scaleC should be quantScaleC only, and this scale is applied before the // activation. Shape is [B]. float const* mPtrScaleAct{nullptr}; @@ -786,7 +786,7 @@ class BatchedGemmInterface { numCtasBatch += batchM ? gemm::divUp(options.mBatchedM[bi], options.mTileM * options.mClusterDimX) * options.mClusterDimX - : gemm::divUp(options.mBatchedN[bi], options.mTileN); + : gemm::divUp(options.mBatchedN[bi], options.mTileN * options.mClusterDimY) * options.mClusterDimY; } } // For MoE, mNumTokens != 0 and the number of CTAs is known only at runtime. @@ -923,19 +923,21 @@ class BatchedGemmInterface { totalNumPaddedTokens += batchM ? gemm::divUpMul(options.mBatchedM[bi], options.mTileM * options.mClusterDimX) - : gemm::divUpMul(options.mBatchedN[bi], options.mTileN); + : gemm::divUpMul(options.mBatchedN[bi], options.mTileN * options.mClusterDimY); } } else { // Get tile in token dim. - auto tileTokensDim = batchM ? options.mTileM * options.mClusterDimX : options.mTileN; + auto tileTokensDim + = batchM ? options.mTileM * options.mClusterDimX : options.mTileN * options.mClusterDimY; totalNumPaddedTokens = data.mProblemDimensions.mMaxNumCtasInTokenDim * tileTokensDim; } // Get options from config. auto& options = config.mOptions; - int const tokenTile = batchM ? options.mTileM * options.mClusterDimX : options.mTileN; + int const tokenTile + = batchM ? options.mTileM * options.mClusterDimX : options.mTileN * options.mClusterDimY; auto const numTokens = totalNumPaddedTokens; auto const intermediateDim = batchM ? options.mN : options.mM; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h index b78600aebfa..981aae7609e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h @@ -100,18 +100,18 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeMmaA, tg::Dtype dtypeMmaB, gemm::EltwiseActType eltwiseActType, bool enablesEarlyExit, bool enablesDelayedEarlyExit, bool enablesGlobalPtxKnobs, int epilogueLdtmDps, int epilogueLdtmBits, int epilogueTileM, int epilogueTileN, - bool fuseUtccpWithUtcmma, bool gridTriggerSecondaryA, bool gridTriggerSecondaryB, - bool gridWaitForPrimaryEarlyExit, bool gridWaitForPrimaryA, bool gridWaitForPrimaryB, bool hoistLoadTaskInit, - bool hoistMmaTaskTryWaits, int k, gemm::KernelTraits kernelTraits, gemm::MatrixLayout layoutA, - gemm::MatrixLayout layoutB, int m, int mmaK, tg::MmaKind mmaKind, int mmaM, int mmaN, bool mockAllReduce, int n, - int numEpilogueWarps, int numRegsCastAWarps, int numRegsCopySfLdsSttm, int numRegsCopySparsityInfo, - int numRegsPerThreadEpilogueWarp, int numRegsPerThreadNonEpilogueWarp, int numSlicesForSplitK, - int numSlicesForSliceK, int numStages, int numStagesMma, int numStagesMmaWithinWorkTile, - int numStagesMmaAcrossWorkTile, int numStagesWorkId, bool outputDebugTensors, bool patchF2fp, - int32_t sfBlockSizeA, int32_t sfBlockSizeB, int32_t sfBlockSizeC, tg::SfLayout sfLayoutA, - tg::SfLayout sfLayoutB, tg::SfLayout sfLayoutC, int32_t sfReshapeFactor, bool sliceK, tg::Sparsity sparsityA, - gemm::SplitK splitK, int tileK, int tileM, int tileN, gemm::TileScheduler tileScheduler, - bool transposeMmaOutput, bool useCustomMmaSchedule, bool useDeepSeekFp8, + int fallbackClusterDimX, int fallbackClusterDimY, int fallbackClusterDimZ, bool fuseUtccpWithUtcmma, + bool gridTriggerSecondaryA, bool gridTriggerSecondaryB, bool gridWaitForPrimaryEarlyExit, + bool gridWaitForPrimaryA, bool gridWaitForPrimaryB, bool hoistLoadTaskInit, bool hoistMmaTaskTryWaits, int k, + gemm::KernelTraits kernelTraits, gemm::MatrixLayout layoutA, gemm::MatrixLayout layoutB, int m, int mmaK, + tg::MmaKind mmaKind, int mmaM, int mmaN, bool mockAllReduce, int n, int numEpilogueWarps, int numRegsCastAWarps, + int numRegsCopySfLdsSttm, int numRegsCopySparsityInfo, int numRegsPerThreadEpilogueWarp, + int numRegsPerThreadNonEpilogueWarp, int numSlicesForSplitK, int numSlicesForSliceK, int numStages, + int numStagesMma, int numStagesMmaWithinWorkTile, int numStagesMmaAcrossWorkTile, int numStagesWorkId, + bool outputDebugTensors, bool patchF2fp, int32_t sfBlockSizeA, int32_t sfBlockSizeB, int32_t sfBlockSizeC, + tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB, tg::SfLayout sfLayoutC, int32_t sfReshapeFactor, bool sliceK, + tg::Sparsity sparsityA, gemm::SplitK splitK, int tileK, int tileM, int tileN, gemm::TileScheduler tileScheduler, + bool transposeMmaOutput, bool useCustomMmaSchedule, bool useDeepSeekFp8, bool useFlexibleClusterDims, bool useHoistTryWaitForCustomMmaSchedule, bool useMaxTmemOverlap, bool usePerTokenSfA, bool usePerTokenSfB, bool useShuffledMatrix, bool useTmaStore, bool useTwoTmaLoadWarps, bool useTwoMmaWarps, bool useUnrollLoop2xForMma, int validM, int validN, int validK, int worldSize, @@ -127,17 +127,18 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions gemm::GemmOptions(allReduceAlgo, biasType, blockK, clcFastDrain, clusterDimX, clusterDimY, clusterDimZ, ctaSwizzleType, dtypeAcc, dtypeA, dtypeB, dtypeC, dtypeMmaA, dtypeMmaB, eltwiseActType, enablesEarlyExit, enablesDelayedEarlyExit, enablesGlobalPtxKnobs, epilogueLdtmDps, epilogueLdtmBits, - epilogueTileM, epilogueTileN, fuseUtccpWithUtcmma, gridTriggerSecondaryA, gridTriggerSecondaryB, - gridWaitForPrimaryEarlyExit, gridWaitForPrimaryA, gridWaitForPrimaryB, hoistLoadTaskInit, - hoistMmaTaskTryWaits, k, kernelTraits, layoutA, layoutB, m, mmaK, mmaKind, mmaM, mmaN, mockAllReduce, n, - numEpilogueWarps, numRegsCastAWarps, numRegsCopySfLdsSttm, numRegsCopySparsityInfo, - numRegsPerThreadEpilogueWarp, numRegsPerThreadNonEpilogueWarp, numSlicesForSplitK, numSlicesForSliceK, - numStages, numStagesMma, numStagesMmaWithinWorkTile, numStagesMmaAcrossWorkTile, numStagesWorkId, - outputDebugTensors, patchF2fp, sfBlockSizeA, sfBlockSizeB, sfBlockSizeC, sfLayoutA, sfLayoutB, - sfLayoutC, sfReshapeFactor, sliceK, sparsityA, splitK, tileK, tileM, tileN, tileScheduler, - transposeMmaOutput, useCustomMmaSchedule, useDeepSeekFp8, useHoistTryWaitForCustomMmaSchedule, - useMaxTmemOverlap, usePerTokenSfA, usePerTokenSfB, useShuffledMatrix, useTmaStore, useTwoTmaLoadWarps, - useTwoMmaWarps, useUnrollLoop2xForMma, validM, validN, validK, worldSize), + epilogueTileM, epilogueTileN, fallbackClusterDimX, fallbackClusterDimY, fallbackClusterDimZ, + fuseUtccpWithUtcmma, gridTriggerSecondaryA, gridTriggerSecondaryB, gridWaitForPrimaryEarlyExit, + gridWaitForPrimaryA, gridWaitForPrimaryB, hoistLoadTaskInit, hoistMmaTaskTryWaits, k, kernelTraits, + layoutA, layoutB, m, mmaK, mmaKind, mmaM, mmaN, mockAllReduce, n, numEpilogueWarps, numRegsCastAWarps, + numRegsCopySfLdsSttm, numRegsCopySparsityInfo, numRegsPerThreadEpilogueWarp, + numRegsPerThreadNonEpilogueWarp, numSlicesForSplitK, numSlicesForSliceK, numStages, numStagesMma, + numStagesMmaWithinWorkTile, numStagesMmaAcrossWorkTile, numStagesWorkId, outputDebugTensors, patchF2fp, + sfBlockSizeA, sfBlockSizeB, sfBlockSizeC, sfLayoutA, sfLayoutB, sfLayoutC, sfReshapeFactor, sliceK, + sparsityA, splitK, tileK, tileM, tileN, tileScheduler, transposeMmaOutput, useCustomMmaSchedule, + useDeepSeekFp8, useFlexibleClusterDims, useHoistTryWaitForCustomMmaSchedule, useMaxTmemOverlap, + usePerTokenSfA, usePerTokenSfB, useShuffledMatrix, useTmaStore, useTwoTmaLoadWarps, useTwoMmaWarps, + useUnrollLoop2xForMma, validM, validN, validK, worldSize), actType, clampBeforeAct) , mBatchedM(batchedM) , mBatchedN(batchedN) @@ -310,7 +311,7 @@ inline bool checkAndUpdateBatchedGemmOptions( TLLM_CHECK_ERROR((options.mRouteSfsImpl.value() == RouteImpl::Ldgsts || options.mRouteSfsImpl.value() == RouteImpl::LdgPlusSts) && options.mRouteImpl == RouteImpl::Tma, - "RouteSfsImpl must be equal to RouteImpl, or Ldgsts/LdgPlusSts, when RouteImpl is Tma"); + "RouteSfsImpl must be equal to RouteImpl, or Ldgsts/LdgPlusSts when RouteImpl is Tma"); } else if (!options.mRouteSfsImpl.has_value()) { @@ -379,8 +380,6 @@ inline bool checkAndUpdateBatchedGemmOptions( if (doesRouteImplUseTma(options.mRouteSfsImpl.value())) { - TLLM_CHECK_ERROR(!batchM, "UTMALDG.GATHER4 only supported for batch N."); - if (tg::mmaKindIsBlockFmt(options.mMmaKind)) { int const numEltsPerSfRoute = batchM ? options.mSfBlockSizeA : options.mSfBlockSizeB; @@ -392,8 +391,9 @@ inline bool checkAndUpdateBatchedGemmOptions( if (!batchM || doesRouteImplUseNoRoute(options.mRouteImpl)) { - TLLM_CHECK_ERROR(options.mSfLayoutA == tg::SfLayout::R128c4, - "options.mSfLayoutA has to be tg::SfLayout::R128c4 when not being routed"); + bool isSupportedSfLayoutA = options.mSfLayoutA == tg::SfLayout::R128c4; + TLLM_CHECK_ERROR(isSupportedSfLayoutA, "options.mSfLayoutA has to be R128cX when not batch M or not routed", + tg::sfLayoutToString(options.mSfLayoutA)); } } @@ -422,12 +422,6 @@ inline bool checkAndUpdateBatchedGemmOptions( options.mK % options.mTileK == 0, "K must be a multiple of tileK when using Ldg based SF routing"); } - if (options.mClusterDimX > 1 && batchM && options.mRouteSfsImpl.has_value()) - { - TLLM_CHECK_ERROR(options.mRouteSfsImpl.value() != RouteImpl::Tma, - "2CTA BatchedGemm does not support routing Sf along M dimension with TMA."); - } - // Check if all elements in mBatchedM or mBatchedN are the same (uniform tokens per batch) and // set mIsUniformNumTokensPerBatch and mBatchStride. if (options.mIsUniformNumTokensPerBatch) diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/Enums.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/Enums.h index 8c921f41968..9e86b808ec0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/Enums.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/Enums.h @@ -107,6 +107,10 @@ enum class EltwiseActType // act = relu(x0) ^ 2 // where x0 is the output of the Gemm. Relu2, + // Silu is defined as the following operation: + // act = x0 * sigmoid(x0) + // where x0 is the output of the Gemm. + Silu, }; //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h index ed50f012b86..0d4a19e89f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h @@ -130,18 +130,18 @@ struct GemmOptions int clusterDimY, int clusterDimZ, CtaSwizzleType ctaSwizzleType, tg::Dtype dtypeAcc, tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeMmaA, tg::Dtype dtypeMmaB, EltwiseActType eltwiseActType, bool enablesEarlyExit, bool enablesDelayedEarlyExit, bool enablesGlobalPtxKnobs, int epilogueLdtmDps, - int epilogueLdtmBits, int epilogueTileM, int epilogueTileN, bool fuseUtccpWithUtcmma, - bool gridTriggerSecondaryA, bool gridTriggerSecondaryB, bool gridWaitForPrimaryEarlyExit, - bool gridWaitForPrimaryA, bool gridWaitForPrimaryB, bool hoistLoadTaskInit, bool hoistMmaTaskTryWaits, int k, - KernelTraits kernelTraits, MatrixLayout layoutA, MatrixLayout layoutB, int m, int mmaK, tg::MmaKind mmaKind, - int mmaM, int mmaN, bool mockAllReduce, int n, int numEpilogueWarps, int numRegsCastAWarps, - int numRegsCopySfLdsSttm, int numRegsCopySparsityInfo, int numRegsPerThreadEpilogueWarp, + int epilogueLdtmBits, int epilogueTileM, int epilogueTileN, int fallbackClusterDimX, int fallbackClusterDimY, + int fallbackClusterDimZ, bool fuseUtccpWithUtcmma, bool gridTriggerSecondaryA, bool gridTriggerSecondaryB, + bool gridWaitForPrimaryEarlyExit, bool gridWaitForPrimaryA, bool gridWaitForPrimaryB, bool hoistLoadTaskInit, + bool hoistMmaTaskTryWaits, int k, KernelTraits kernelTraits, MatrixLayout layoutA, MatrixLayout layoutB, int m, + int mmaK, tg::MmaKind mmaKind, int mmaM, int mmaN, bool mockAllReduce, int n, int numEpilogueWarps, + int numRegsCastAWarps, int numRegsCopySfLdsSttm, int numRegsCopySparsityInfo, int numRegsPerThreadEpilogueWarp, int numRegsPerThreadNonEpilogueWarp, int numSlicesForSplitK, int numSlicesForSliceK, int numStages, int numStagesMma, int numStagesMmaWithinWorkTile, int numStagesMmaAcrossWorkTile, int numStagesWorkId, bool outputDebugTensors, bool patchF2fp, int32_t sfBlockSizeA, int32_t sfBlockSizeB, int32_t sfBlockSizeC, tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB, tg::SfLayout sfLayoutC, int sfReshapeFactor, bool sliceK, tg::Sparsity sparsityA, SplitK splitK, int tileK, int tileM, int tileN, TileScheduler tileScheduler, - bool transposeMmaOutput, bool useCustomMmaSchedule, bool useDeepSeekFp8, + bool transposeMmaOutput, bool useCustomMmaSchedule, bool useDeepSeekFp8, bool useFlexibleClusterDims, bool useHoistTryWaitForCustomMmaSchedule, bool useMaxTmemOverlap, bool usePerTokenSfA, bool usePerTokenSfB, bool useShuffledMatrix, bool useTmaStore, bool useTwoTmaLoadWarps, bool useTwoMmaWarps, bool useUnrollLoop2xForMma, int validM, int validN, int validK, int worldSize) @@ -167,6 +167,9 @@ struct GemmOptions , mEpilogueLdtmBits{epilogueLdtmBits} , mEpilogueTileM{epilogueTileM} , mEpilogueTileN{epilogueTileN} + , mFallbackClusterDimX{fallbackClusterDimX} + , mFallbackClusterDimY{fallbackClusterDimY} + , mFallbackClusterDimZ{fallbackClusterDimZ} , mFuseUtccpWithUtcmma{fuseUtccpWithUtcmma} , mGridTriggerSecondaryA{gridTriggerSecondaryA} , mGridTriggerSecondaryB{gridTriggerSecondaryB} @@ -218,6 +221,7 @@ struct GemmOptions , mTransposeMmaOutput{transposeMmaOutput} , mUseCustomMmaSchedule{useCustomMmaSchedule} , mUseDeepSeekFp8{useDeepSeekFp8} + , mUseFlexibleClusterDims{useFlexibleClusterDims} , mUseHoistTryWaitForCustomMmaSchedule{useHoistTryWaitForCustomMmaSchedule} , mUseMaxTmemOverlap{useMaxTmemOverlap} , mUsePerTokenSfA{usePerTokenSfA} @@ -286,6 +290,12 @@ struct GemmOptions int mEpilogueTileM{128}; // Tile size for the epilogue in N dimension. int mEpilogueTileN{32}; + // Fallback Cluster size in X dim. + int mFallbackClusterDimX{1}; + // Fallback Cluster size in Y dim. + int mFallbackClusterDimY{1}; + // Fallback Cluster size in Z dim. + int mFallbackClusterDimZ{1}; // Whether fuse UTCCP with UTC*MMA. bool mFuseUtccpWithUtcmma{false}; // Whether load task A triggers the next grid. @@ -396,6 +406,8 @@ struct GemmOptions bool mUseCustomMmaSchedule{false}; // Use DeepSeek Fp8. bool mUseDeepSeekFp8{false}; + // Use flexible cluster dims. + bool mUseFlexibleClusterDims{false}; // The purpose of hoisting trywaits is to opportunistically peek at the availability of the next // k-block. It benefits when the next k-block is already available and thus sustaining the // momentum, but it adds latency to the first k-block for smaller k-loop. @@ -502,6 +514,21 @@ inline std::string toString(CtaSwizzleType e) //////////////////////////////////////////////////////////////////////////////////////////////////// +template <> +inline std::string toString(EltwiseActType e) +{ + switch (e) + { + case EltwiseActType::None: return "None"; + case EltwiseActType::Gelu: return "Gelu"; + case EltwiseActType::Relu2: return "Relu2"; + case EltwiseActType::Silu: return "Silu"; + default: return std::to_string(static_cast(e)); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + inline std::string dumpOptions(GemmOptions const& options, bool dumpRuntimeParams = true) { std::stringstream ss; @@ -547,6 +574,9 @@ inline std::string dumpOptions(GemmOptions const& options, bool dumpRuntimeParam ss << "mEpilogueLdtmBits=" << options.mEpilogueLdtmBits << "," << std::endl; ss << "mEpilogueTileM=" << options.mEpilogueTileM << "," << std::endl; ss << "mEpilogueTileN=" << options.mEpilogueTileN << "," << std::endl; + ss << "mFallbackClusterDimX=" << options.mFallbackClusterDimX << "," << std::endl; + ss << "mFallbackClusterDimY=" << options.mFallbackClusterDimY << "," << std::endl; + ss << "mFallbackClusterDimZ=" << options.mFallbackClusterDimZ << "," << std::endl; ss << "mFuseUtccpWithUtcmma=" << options.mFuseUtccpWithUtcmma << "," << std::endl; ss << "mGridTriggerSecondaryA=" << options.mGridTriggerSecondaryA << "," << std::endl; ss << "mGridTriggerSecondaryB=" << options.mGridTriggerSecondaryB << "," << std::endl; @@ -624,6 +654,7 @@ inline std::string dumpOptions(GemmOptions const& options, bool dumpRuntimeParam ss << "mTransposeMmaOutput=" << options.mTransposeMmaOutput << "," << std::endl; ss << "mUseCustomMmaSchedule=" << options.mUseCustomMmaSchedule << "," << std::endl; ss << "mUseDeepSeekFp8=" << options.mUseDeepSeekFp8 << "," << std::endl; + ss << "mUseFlexibleClusterDims=" << options.mUseFlexibleClusterDims << "," << std::endl; ss << "mUseHoistTryWaitForCustomMmaSchedule=" << options.mUseHoistTryWaitForCustomMmaSchedule << "," << std::endl; ss << "mUseMaxTmemOverlap=" << options.mUseMaxTmemOverlap << "," << std::endl; ss << "mUsePerTokenSfA=" << options.mUsePerTokenSfA << "," << std::endl; @@ -1158,18 +1189,21 @@ inline bool checkAndUpdateGemmOptions( if (tg::dtypeIsBlockFmt(options.mDtypeA)) { + int sfATileK = 4; int numEltsPerSfA = options.mSfBlockSizeA; - TLLM_CHECK_ERROR(options.mTileK % (4 * numEltsPerSfA) == 0, "TileK (", options.mTileK, - ") must be a multiple of ", (4 * numEltsPerSfA), " for typeA ", gemm::toString(options.mDtypeA)); - auto const numEltsPerSfAInK = options.mK / numEltsPerSfA; - TLLM_CHECK_ERROR(numEltsPerSfAInK % 4 == 0, "K dimension of scaling factors for A (", numEltsPerSfAInK, - ") must be a multiple of 4"); + TLLM_CHECK_ERROR(options.mTileK % (sfATileK * numEltsPerSfA) == 0, "TileK (", options.mTileK, + ") must be a multiple of ", (sfATileK * numEltsPerSfA), " for numEltsPerSfA=", numEltsPerSfA, + " and SF layout ", tg::sfLayoutToString(options.mSfLayoutA)); + auto const numEltsPerSfAInK = divUp(options.mK, numEltsPerSfA); + TLLM_CHECK_ERROR(numEltsPerSfAInK % sfATileK == 0, "K dimension of scaling factors for A (", numEltsPerSfAInK, + ") must be a multiple of ", sfATileK, " for SF layout ", tg::sfLayoutToString(options.mSfLayoutA)); } if (tg::dtypeIsBlockFmt(options.mDtypeB)) { TLLM_CHECK_ERROR(options.mSfLayoutB == tg::SfLayout::R128c4 || options.mSfLayoutB == tg::SfLayout::R8c4 || options.mSfLayoutB == tg::SfLayout::Linear, - "Only the 128x4 and 8x4 SF layouts are supported for B, got ", tg::sfLayoutToString(options.mSfLayoutB)); + "Only the 128x4, 8x4 and linear SF layouts are supported for B, got ", + tg::sfLayoutToString(options.mSfLayoutB)); // TileN must be a multiple of the number of rows per SF tile. int const numSfTileRowsB = options.mSfLayoutB == tg::SfLayout::R128c4 ? 128 : 8; @@ -1301,7 +1335,7 @@ inline bool checkAndUpdateGemmOptions( if (!options.mSliceK) { - TLLM_CHECK_ERROR(options.mMmaM / options.mClusterDimX <= options.mEpilogueTileM, + TLLM_CHECK_ERROR(options.mMmaM / (options.mClusterDimX > 1 ? 2 : 1) <= options.mEpilogueTileM, "EpilogueTileM must be larger or equal than mmaM."); } else @@ -1312,7 +1346,7 @@ inline bool checkAndUpdateGemmOptions( (options.mTileN & (options.mTileN - 1)) == 0, "For Slice-K TileN is required to be a power of 2"); } - if (options.mClusterDimX == 2) + if (options.mClusterDimX >= 2) { TLLM_CHECK_ERROR(options.mMmaM == 256, "Only mmaM = 256 is supported for 2CTA UTCMMA."); TLLM_CHECK_ERROR(options.mMmaN % 16 == 0, "mmaN needs to be multiple of 16 for 2CTA UTCMMA."); @@ -1320,12 +1354,39 @@ inline bool checkAndUpdateGemmOptions( TLLM_CHECK_ERROR(options.mTileM % options.mEpilogueTileM == 0 && options.mTileN % options.mEpilogueTileN == 0, "TileM and TileN must be divisible by EpilogueTileM and EpilogueTileN respectively."); - TLLM_CHECK_ERROR((options.mClusterDimX == 1 || options.mClusterDimX == 2) && options.mClusterDimY == 1, - "GEMM does not support cluster in X and Y dimensions."); + TLLM_CHECK_ERROR((options.mClusterDimX == 1 || options.mClusterDimX == 2 || options.mClusterDimX == 4) + && (options.mClusterDimY == 1 || options.mClusterDimY == 2 || options.mClusterDimY == 4), + "GEMM only support cluster sizes in X and Y of 1, 2 and 4, but found ", options.mClusterDimX, " and ", + options.mClusterDimY); TLLM_CHECK_ERROR( options.mClusterDimZ == 1 || options.mNumSlicesForSplitK > 1, "Cluster DimZ is only allowed for split-k."); TLLM_CHECK_ERROR(options.mTileM <= 128, "GEMM does not support TileM > 128."); + if (options.mClusterDimY > 1) + { + TLLM_CHECK_ERROR( + options.mClusterDimX >= 2, "When mClusterDimY > 1, options.mClusterDimX has to at least be 2."); + } + + if (options.mClusterDimX > 2 || options.mClusterDimY > 1) + { + TLLM_CHECK_ERROR(options.mUseTwoTmaLoadWarps, "Wider CGA sizes requires options.mUseTwoTmaLoadWarps"); + TLLM_CHECK_ERROR(options.mClusterDimZ == 1, + "Only options.mClusterDimZ == 1 is supported when having CGA larger or equal than 2x1x1."); + } + + if (options.mUseFlexibleClusterDims) + { + TLLM_CHECK_ERROR(options.mClusterDimX >= 2 && options.mFallbackClusterDimX >= 2, + "mClusterDimX and mFallbackClusterDimX can only be 2 or 4 for now."); + TLLM_CHECK_ERROR(options.mFallbackClusterDimX > 0, "options.mFallbackClusterDimX needs to be positive"); + TLLM_CHECK_ERROR(options.mFallbackClusterDimY > 0, "options.mFallbackClusterDimY needs to be positive"); + TLLM_CHECK_ERROR(options.mClusterDimX % options.mFallbackClusterDimX == 0, + "mClusterDimX needs to be a multiple of mFallbackClusterDimX"); + TLLM_CHECK_ERROR(options.mClusterDimY % options.mFallbackClusterDimY == 0, + "mClusterDimY needs to be a multiple of mFallbackClusterDimY"); + } + // FIXME: this is a bug in DeepSeek Fp8. if (options.mUseDeepSeekFp8) { @@ -1704,6 +1765,9 @@ inline bool checkAndUpdateGemmOptions( TLLM_CHECK_ERROR(options.mDtypeA == tg::Dtype::E4m3 && options.mDtypeB == tg::Dtype::E4m3, "A and B dtype must be E4m3 for Meta Fp8. Found dtypeA=", tg::dtypeToString(options.mDtypeA), " dtypeB=", tg::dtypeToString(options.mDtypeB)); + TLLM_CHECK_ERROR(options.mDtypeC == tg::Dtype::Fp32 || options.mDtypeC == tg::Dtype::Bfloat16 + || options.mDtypeC == tg::Dtype::Fp16, + "Only Fp32, Bfloat16, Fp16 output dtypes are supported for Meta Fp8"); } else { @@ -1738,22 +1802,35 @@ inline bool checkAndUpdateGemmOptions( { bool const isBlockA = options.mLayoutA == MatrixLayout::BlockMajorK; - // Block K size must be 128B. - // TODO Leaving this as an option for now in case we want to expertiment with other block sizes - // As the user is not expected to set this, do not fail if updateOptions is false + int32_t const padMultiplier = (isBlockA) ? padMultiplierA : padMultiplierB; int32_t const elemSizeInBits = (isBlockA) ? tg::dtypeGetNumBits(options.mDtypeA) : tg::dtypeGetNumBits(options.mDtypeB); int32_t const elemsIn128B = 128 * 8 /* Bits in byte */ / elemSizeInBits; - if (options.mBlockK != elemsIn128B) + // Number of non-zero elements in the k dimension. + int32_t const nzTileK = options.mTileK >> static_cast(isBlockA && isSparseA); + // Number of 128B SMEM slices per tile. + int32_t const smemSlicesPerTile = padMultiplier * nzTileK / elemsIn128B; + + if (smemSlicesPerTile > 2) { - if (updateOptions) + if (options.mBlockK != elemsIn128B / padMultiplier) { - options.mBlockK = elemsIn128B; + // This is to prevent a bug when the TMA box width is truncated to 128B (after padding) + // and multiple TMA instructions are loading multiple non-contiguous slices each. + // E.g. TMA #0 loads slices (0,2), TMA #1 loads slices (1,3) + TLLM_LOG_WARNING("TileK=", options.mTileK, " with ", padMultiplier, "x padding spans across ", + smemSlicesPerTile, " 128B SMEM slices. Setting blockK to ", elemsIn128B / padMultiplier); + GEMM_UPDATE_OR_ERROR(options.mBlockK, elemsIn128B / padMultiplier); } - else + } + else + { + // The larger blockK (128B vs 64B) is generally 1-2% more performant. + if (options.mBlockK != elemsIn128B && options.mBlockK != elemsIn128B / padMultiplier) { - return false; + TLLM_LOG_WARNING("Setting blockK to ", elemsIn128B); + GEMM_UPDATE_OR_ERROR(options.mBlockK, elemsIn128B); } } @@ -1813,7 +1890,7 @@ inline bool checkAndUpdateGemmOptions( options.mAllReduceAlgo, options.mFuseUtccpWithUtcmma, options.mUseMaxTmemOverlap, options.mNumEpilogueWarps, isPersistentScheduler(options.mTileScheduler), options.mUseDeepSeekFp8, options.mUsePerTokenSfA, options.mUsePerTokenSfB, - /* useTwoCtas*/ options.mClusterDimX == 2, options.mBiasType); + /* useTwoCtas*/ options.mClusterDimX >= 2, options.mBiasType); } return true; @@ -1829,32 +1906,34 @@ inline bool getDoesScaleC(tg::Dtype dtypeC) //////////////////////////////////////////////////////////////////////////////////////////////////// -inline bool getDoesScaleAb(tg::Dtype dtypeA, tg::Dtype dtypeB, bool useDeepSeekFp8) +inline bool getDoesScaleAb(tg::Dtype dtypeA, tg::Dtype dtypeB, bool useDeepSeekFp8, bool useMetaFp8) { // Need to scale/dequantize the input A/B matrices when the input type is Fp8 or NvFp4 and // DeepSeekFp8 is not used. bool const doesScaleAb{dtypeA == tg::Dtype::E2m1 || dtypeB == tg::Dtype::E2m1 - || ((dtypeA == tg::Dtype::E4m3 || dtypeB == tg::Dtype::E4m3) && !useDeepSeekFp8)}; + || ((dtypeA == tg::Dtype::E4m3 || dtypeB == tg::Dtype::E4m3) && !useDeepSeekFp8 && !useMetaFp8)}; return doesScaleAb; } ////////////////////////////////////////////////////////////////////////////////////////////////// -inline bool getDoesScaleAct(tg::Dtype dtypeA, tg::Dtype dtypeB, bool useDeepSeekFp8, EltwiseActType eltwiseActType) +inline bool getDoesScaleAct( + tg::Dtype dtypeA, tg::Dtype dtypeB, bool useDeepSeekFp8, bool useMetaFp8, EltwiseActType eltwiseActType) { // Only non-linear activations require separate scaleAct. bool const isLinearAct = eltwiseActType == EltwiseActType::None; - return !isLinearAct && getDoesScaleAb(dtypeA, dtypeB, useDeepSeekFp8); + return !isLinearAct && getDoesScaleAb(dtypeA, dtypeB, useDeepSeekFp8, useMetaFp8); } //////////////////////////////////////////////////////////////////////////////////////////////////// -inline bool getKernelDoesScaleC(tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, bool useDeepSeekFp8) +inline bool getKernelDoesScaleC( + tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, bool useDeepSeekFp8, bool useMetaFp8) { // In the Gemm/BatchedGemm kernels, dequantScaleAb and quantScaleC are combined into one single // scaling factor (called scaleC). As a result, we combine the logic for getDoesScaleAb and // getDoesScaleC. - return getDoesScaleC(dtypeC) || getDoesScaleAb(dtypeA, dtypeB, useDeepSeekFp8); + return getDoesScaleC(dtypeC) || getDoesScaleAb(dtypeA, dtypeB, useDeepSeekFp8, useMetaFp8); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1865,8 +1944,8 @@ inline CUresult loadCubinData(CUmodule* module, Config const& config) // Trtllm links the cubin into the executable while Flashinfer loads the cubin from storage. #ifdef TLLM_GEN_EXPORT_FLASHINFER #ifdef TLLM_GEN_GEMM_CUBIN_PATH - static const std::string tllm_gen_gemm_cubin_path = std::string(TLLM_GEN_GEMM_CUBIN_PATH); - const std::string sha256 = config.mHash ? config.mHash : ""; + static std::string const tllm_gen_gemm_cubin_path = std::string(TLLM_GEN_GEMM_CUBIN_PATH); + std::string const sha256 = config.mHash ? config.mHash : ""; std::string fileName = config.mFunctionName; if (!fileName.empty()) { diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h index fd6c021e4f8..5631daf22cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h @@ -28,1051 +28,1165 @@ namespace kernels { // clang-format off -#define TLLM_GEN_COMMIT "b3c16468-dirty" +#define TLLM_GEN_COMMIT "b7b335a4-dirty" #define TLLM_GEN_EXPORT_VERSION "7.0.4.0.4.0" #ifndef EXCLUDE_SM_100 -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin[]; #endif // EXCLUDE_SM_100 #ifndef EXCLUDE_SM_100F -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; -extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_silu_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin[]; #endif // EXCLUDE_SM_100F #ifndef EXCLUDE_SM_103 -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; -extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; +extern unsigned char Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin[]; #endif // EXCLUDE_SM_103 #ifndef EXCLUDE_SM_100 -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len; #endif // EXCLUDE_SM_100 #ifndef EXCLUDE_SM_100F -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; -extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_silu_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len; #endif // EXCLUDE_SM_100F #ifndef EXCLUDE_SM_103 -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; -extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; +extern unsigned int Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len; #endif // EXCLUDE_SM_103 static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { #ifndef EXCLUDE_SM_100 -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 116304, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "bc5e9a1d58573fb570bea7c17486541914a1c16aee174732b2d8d1efd07cf680", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 116304, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "9b8e8af75df59539d9751042f5d4fbe9323e1c1eab402e2741ad9fa9f424ac81", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -1094,6 +1208,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -1145,6 +1262,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -1182,7 +1300,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "0afcf4acab33fbf39eae7c8c65b99585d91fc5a1e47bed38b6173a169575bf6b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "a4e5b89eef49608fdd0f8f77c481ab518a60a0d85b9fe5c6dd0fdba77fdf6e6d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -1204,6 +1322,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -1255,6 +1376,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -1292,7 +1414,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 116304, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "bbd604c909f350cc92f2e364ec9419079bb2d630f5944f51b4d75f442b892891", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 116304, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "5eaac110841fafa28f63c93543ac6768282fae65bc804a5427033a36ce6de383", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -1314,6 +1436,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -1365,6 +1490,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -1402,7 +1528,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "7c911a38527162222e749808816306c2d571f119fffa6679c1e9e750807fd9a5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "d712b52226313bba3467e13abbd694486b9b4102d7810d785b1902abe7aeb204", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -1424,6 +1550,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -1475,6 +1604,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -1512,7 +1642,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 140880, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "40b0c9cfb1a50d155c23f95bd55bcce2794f1b1a403e9e2ab026e4509d3319d2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 140880, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "399191ec0f31d00f507eca1e38186c625d78bbf45b6f70973d147e31b8d47bf5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -1534,6 +1664,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -1585,6 +1718,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -1622,7 +1756,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "e02a8ca4d6aaa36d48255cfe348a2c67acac06a9291ac3aa9e4652a23cbadb24", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "be953b3cabcda1d98bd03882613439e271afaf9f7c80df71fbcfdb88504cfbaf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -1644,6 +1778,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -1695,6 +1832,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -1732,7 +1870,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 140880, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "8bc025eae6e0365824fcae05a21f21e83336f85781c6dae34ce6428c94af5ee8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 140880, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "65df3c75cc7e307f146725b30d0f33b166075fc59e8c8cdc1405aabee7e555e9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -1754,6 +1892,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -1805,6 +1946,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -1842,7 +1984,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "de06acc568c09a3e012cc865af6b9a32571de8d66df666b9aa13f0f260e41704", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "f31fe0405df81f623c9ceb71744d4a22eb924f2ec585b8f052f717dc7be99ded", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -1864,6 +2006,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -1915,6 +2060,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -1952,7 +2098,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 157200, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "d23b61bca482e22dd327aeb90b7ad36d73dde4c4212eb11678a0e191ea27f824", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 157200, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "2f17449dce0fd578e6746f141448500262b199c5667a69dfcaf52c16ec1f4c29", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -1974,6 +2120,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -2025,6 +2174,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -2062,7 +2212,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "573eaea532e2f3256f1b6b1ae4c2c0c9610b0ee91d3745b28bd9069b7f4ae11f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "2efecb61c6bd259ffdfeb77744fda2c1ac9b5a7c85370998a88ab5f3283dec6f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -2084,6 +2234,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -2135,6 +2288,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -2172,7 +2326,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 157200, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "b8879f801f60fff574d5f35c886bda0902bad7194aff4625d5122521b8c7c6cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 157200, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "bb47d190bad00036d032140884c7f544fc33f6dc62cf544bdae96e4c07579baa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -2194,6 +2348,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -2245,6 +2402,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -2282,7 +2440,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "5f917a5a52770feb2c51ba34d298f0fc56aeec27aa283c2e4a4ed76f3a83e696", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "8aff7f6c0df5c78f16ba31099b01d922b699eb9f5ddcac4964654fd70e3c84a5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -2304,6 +2462,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -2355,6 +2516,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -2392,7 +2554,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 104016, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "92202e3d7162247b4da8e674099dba08c6a78b87051ebc665140e42d5182aa46", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 104016, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "1bb0d3751f13408d28f76374b0a21e5a538906b2b7c3c45a9756334dc6ca067b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -2414,6 +2576,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -2465,6 +2630,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -2502,7 +2668,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "3fcac34676c7b71d248910601fda119245047d7a82af61fada7ac1bcd9aa13e7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "2dc54be62fbd2567dc417dbd1654f41fa7c8c6194e578b02218e286dd00a2655", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -2524,6 +2690,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -2575,6 +2744,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -2612,7 +2782,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 104016, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "4d31a20074c70ded1867377ed32c4caf6978470c9a97df242c0b541751d5a7b7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 104016, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "220ae493f877ddc0c378b956374f65c97362bb2b47417a4d9964ecfe1073edc6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -2634,6 +2804,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -2685,6 +2858,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -2722,7 +2896,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "bb7b827d97ba255bbec7ad910fa0c2df9910865415c63984b6ddbcdff8885006", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "df74466e39a2290e6641033ef58b8ae9ba6773bfa9106f0e4d0b59d13813eb52", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -2744,6 +2918,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -2795,6 +2972,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -2832,7 +3010,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 123344, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "5f4935a5c0227578327f85a148f6ea1ac0445e12b7b39f4827f76bf89c93bfbe", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 123344, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "85f56dbda3ff30379a8362d28e47e165107c62d9e7ac96101dcfad5361dd0de9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -2854,6 +3032,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -2905,6 +3086,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -2942,7 +3124,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "864cf66a80b45eba64b825d6b01470863bee9117e2156f9af964a258a7c5890d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "a1605981b8c2318d8b3e2217b8db2eb7f9de174f4c5275b520ac399aad67595f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -2964,6 +3146,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -3015,6 +3200,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -3052,7 +3238,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 123344, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "02954d2dd9b19e4d7bd1f35db4610c498907cb76041e763936b7f7ac74235bcd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 123344, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a", 512, "dec0fdb0263f171fb6e072ab8606e27528dd828b68542ad9b39e74d71ddac875", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -3074,6 +3260,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -3125,6 +3314,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -3162,7 +3352,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "645a89862e81ebc255b2bfa548dfeaef58ccfffbb4d8e04732117e0ea1aa6c64", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a", 384, "4061963c1ead2fb066861c537fef784e067df0b6b541a6e152d646a32f1e28b1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -3184,6 +3374,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -3235,6 +3428,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -3272,7 +3466,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 114256, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "f62a0901e80121abc054ce96a3f923652295d11c7da316bd98a38ca90e6ba89a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 114256, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "1e76cd99ce910c5a0e60221aebe8ff20dcec8a8c460f7d41607b27ee7c7e3463", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -3294,6 +3488,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -3345,6 +3542,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -3382,7 +3580,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "7b976cf5ae1bdfd6707102bd06590ab0cb63041e78529b79102a9718339b17ae", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "0d7adcbce4ae4d96a934650b595b39836283d3c124f3c55f8ed1e75a666283ec", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -3404,6 +3602,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -3455,6 +3656,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -3492,7 +3694,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 114256, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "3baf1e8b0ccdad16e1c2ce0314d098db451f0daecf1e85c3ea2e8df2bce04756", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 114256, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "cc4bbc5455fd18ddbd082f1f2acc2a1f17ce9380115eec8935d96411ee61de12", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -3514,6 +3716,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -3565,6 +3770,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -3602,7 +3808,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "c619d9a792e57da4604b24deec46673bf822598a54ed91eada9a62ef674e2e14", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "c0ab88d91215074fa63b5ebae02058cd0e3f62d8ff1cc2248d44fb5e1f5fc1af", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -3624,6 +3830,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -3675,6 +3884,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -3712,7 +3922,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 136784, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "11768eb35bff09f1dcb8d619095ece09ec47c828d95835cb56c31cbe3eb66472", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 136784, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "b85101f2db7fe62dd7873b41c9f67fb2c267062a6d23014ef13efd2cac6586ae", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -3734,6 +3944,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -3785,6 +3998,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -3822,7 +4036,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "0c89fe86107ca6eba47a0afed6180c1a5b80999ed990d664fc70d3665b8fc508", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "c0471472b79d9a5bd8f38fe928d490cfd87776ba6974b2a7d1a619f57f4e6180", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -3844,6 +4058,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -3895,6 +4112,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -3932,7 +4150,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 136784, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "85c3ca6331e8ad60ffbfdd1f19f44590559cede5d4869d2f65df9ac1f99525f4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 136784, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "7515143de13232b72c35b7f39659a9aaa09999e793c2ca66849849b695556382", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -3954,6 +4172,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -4005,6 +4226,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -4042,7 +4264,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "e719e9cf9a34ee277a9f105322fd15afc3e127c111a1319897dd2efb38020445", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "bd05c50624c407a7c978933231ca4d17b7c6ddc3536bf9c194653e9b063f5f08", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -4064,6 +4286,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -4115,6 +4340,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -4152,7 +4378,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 149008, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "0be9e673362152a8c4848b73102d4fb7c4963759a0919a9284c788b1331708c3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 149008, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "2f5dbd9bdedaa65dcaeaf27cc0fd6c0ddf502e7bb2edec305bd4878bb8ecccb6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -4174,6 +4400,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -4225,6 +4454,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -4262,7 +4492,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "2da2fdab64bbd51fcf7e458a56195a07cd042f79935bea281549fe7fc3c4cbbc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "c423764a32353ebfdca9b6d811a3f02c2bbabbd191e990fc9d344b3fd53cd84d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -4284,6 +4514,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -4335,6 +4568,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -4372,7 +4606,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 149008, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "ecec8ac4925b18b62cf73c3406ba1d4aeae8621fc83422957cb4fcad80ea06ea", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 149008, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "b934aa48344c6415298748ff14853d1c40c3b38ff67980afede842fba9023801", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -4394,6 +4628,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -4445,6 +4682,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -4482,7 +4720,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "009f34105ba77411e3890368086d62550cdf40f332df53d64434fec0296e2097", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "ba9585735dcc7e11d15bcfdc088387a34efcd70bbd678e13bd9db9c3b7341289", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -4504,6 +4742,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -4555,6 +4796,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -4592,7 +4834,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 102992, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "f9557406102a5967592b6a757548a845af5cf4f167176e14eb1b16d140a626b2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 102992, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "b1561ccb06319de34422c48da98864772f9720fd470adbe30a85bb5b926e4473", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -4614,6 +4856,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -4665,6 +4910,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -4702,7 +4948,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "0365ee126d50fce8f96ff3cd4f86c2c6680be877cdeb333e3f41812883d256bf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "c9b41a0caa9acc9670dd8882d38f8767ec2b6e9306c019d6eb91392796b97eea", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -4724,6 +4970,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -4775,6 +5024,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -4812,7 +5062,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 102992, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "7b5e7736d063a1864a3f963a4519c3657fe207eb7d4787b4afe78f4e00139ef9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 102992, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "f195f2c977bd108e18651ed283d54aac3e6ff06a31a4d77e25cc0bde7b49197d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -4834,6 +5084,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -4885,6 +5138,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -4922,7 +5176,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "93e5b5e1dd839c4d2503e203bf2e40b7ce05f412038261c1a1d5e8c0890da5e3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "68e4c21fc489519c8fed4b5627c4b9289394801adbc7b7fe422598781ebcf5eb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -4944,6 +5198,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -4995,6 +5252,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -5032,7 +5290,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 122320, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "869b815f75181c8cb6cb2599b73d995ab79ecba422c98f3b7b37108e8a4ebfa1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 122320, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "b0db3a94ff35f23cc92a84351ea5dbe61011e0324fd7960a589c134b7a5916cc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -5054,6 +5312,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -5105,6 +5366,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -5142,7 +5404,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "45fb7c23182549f79ef881efa865a8004e1dd173257c6d06c4e42235f8cb6a1d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "d129f210d5cfff1c5a214f61793d18841bd33f9fec69533024bbe58fa4478170", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -5164,6 +5426,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -5215,6 +5480,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -5252,7 +5518,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 122320, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "a606fdbbd7462bab02b1f9ca028d7d29d8ffbb50fb3307dea13e60621a65bcd6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 122320, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "70902172df67a11730e3cf632a8cc6ea27843babb70aa10cc767cfc7e1b0037d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -5274,6 +5540,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -5325,6 +5594,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -5362,7 +5632,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "bf23e5d0903d8283cb07831272309e27d0c251c9a70bf5df36f4cfed601b2a0a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a", 512, "1025c703b26a669b781b10da308ef22381ff7f1a274a8a408e1d85b6ad0754a7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -5384,6 +5654,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -5435,6 +5708,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -5474,7 +5748,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { }, gemm::SmVersion::Sm100a}, #endif // EXCLUDE_SM_100 #ifndef EXCLUDE_SM_100F -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 211704, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "4c462072bd79230c3e63209b8bff4074d57a86d20d4213f5dc9ab5c7aac42e4d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 211704, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "7a43b1b698259473d4aaaa23a7141d574a9caf3db46982a33da48cd310814301", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -5496,6 +5770,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 128 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -5547,6 +5824,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -5584,7 +5862,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 211704, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "06fcdf41e1cd218a6e27b9b63d945d345f245b4620e96a9026f946774921f528", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 211704, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "1963ee6bd12e30cb4eacd71acc9e3e227bce8b25fdfe72d982fcba5f8dece6ba", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -5606,6 +5884,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 128 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -5657,6 +5938,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -5694,7 +5976,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 196720, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "09841847b297a836668fe9bb6296d19d4e6d02245bf70e4efc4d7e667cf3f869", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 196720, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "d1e8cb7daf4b022164cd15aba93831506fbfe995784e7aff970e09cff33d56bc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -5716,6 +5998,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -5767,6 +6052,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -5804,7 +6090,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 196480, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "456f483972ca4c8b54eeddfef052c1d5f68525767ca177653647040dd2266bf6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 196480, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f313ba12828fc91923cc6fb5c7f0aab4551681d4c172356396a1d73837963f98", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -5826,6 +6112,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -5877,6 +6166,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -5914,7 +6204,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 196720, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "382b539df9c37a22a45753e26da799f70b514be404e169d2256893fb92c1a81d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 196720, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "bd8a8ad2582ddec2725befa44f518f35b46b1a7e1da5289a85ddc2931ea355c7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -5936,6 +6226,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -5987,6 +6280,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -6024,7 +6318,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 196480, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "209ea07bb19fd3e9ee16434e166d7c8e87d0a9e590d92c5c329d7b3f486fd84c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 196480, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "e5f0bc6c4a11e1c950c38dccb589a3cddac97188ec2176d1e329052ec28dcebe", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -6046,6 +6340,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -6097,6 +6394,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -6134,7 +6432,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206584, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "28a1e22576cb60f6855f30e5ddcd22f29d8221fad3a8ab8f1c3c0fec9bf59f76", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206584, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "45ae59c7ab527c5c5f175bc4adad2992c87de5f8d4eb606eb47ea44f2e034c37", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -6156,6 +6454,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -6207,6 +6508,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -6244,7 +6546,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206344, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "c54bd6682625d6ead59e3e70569bba68a5343f32c8e905d27fd9fe9938170268", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206344, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "72cd5e80f666ddbaf0816a7e1b80479b23570bdcd37e49bbd8133f7fd8367d17", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -6266,6 +6568,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -6317,6 +6622,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -6354,7 +6660,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206584, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f75853bde81a2ba216b9f2957dec79ad776e5f7fa979f529e77f679f0b6f008b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206584, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "46302945003b2dc22df49a4dd2814a340b2b4a1cc0c5a15a1c08aa257e7e799e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -6376,6 +6682,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -6427,6 +6736,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -6464,7 +6774,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206344, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f8571debf1569edea0756a3cb365ee1d823e4176d33655175f16328ee6818fc6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206344, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "c9890608c86a2b9fa9f72a9a2bc0b3751ad86f9c4693df1fc452d84b6227b3c2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -6486,6 +6796,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -6537,6 +6850,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -6574,7 +6888,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin_len, 219800, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f", 384, "8e9d6e30d63c7310a399e2870d08f7ff6220c45518e9d1c080ad8e6312a3bb6a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin_len, 219800, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f", 384, "b655bbbdb07a6f610d3992225da83155cdb9242c44494d28acbfba71fae632af", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -6596,6 +6910,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 1 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -6647,6 +6964,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 1 , /* mUsePerTokenSfA */ 0 @@ -6684,7 +7002,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 221296, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "a34c102742981bffd86969beb1e1473f48b81c9c48e49197a4c362d231e9ac42", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 221296, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "9c7b741d90d402975f6d58dc75808dc292cdc79f6f931fd098e92b290e58de47", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -6706,6 +7024,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -6757,6 +7078,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -6794,7 +7116,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 221056, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "bbf8ec44483e345f4b1b1701fcc9a049365dc781b1b4731256cecf40acf29aee", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 221056, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "b344744b0ee91cad4c922fa855db736b179526ec21ff66436177618fa265cc8d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -6816,6 +7138,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -6867,6 +7192,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -6904,7 +7230,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 221296, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "905f395779cf7cd102e3f20f5ca605e826ae1da8ac98b816e6a8d9a45d583e97", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 221296, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "3b0db8ee858e6f3b5dde27758269212d5903a48a3b68363106f7030f17fdc9d4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -6926,6 +7252,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -6977,6 +7306,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -7014,7 +7344,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 221056, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "81b413fba90adfa18c024da7e37e415b082aff53c10804d8dca0735123241bea", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 221056, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "dbab80a9f6eedae187ce4a0c2c9c3cd6eb34fef3e7e7e3f757765f27d59d9bb9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -7036,6 +7366,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -7087,6 +7420,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -7124,7 +7458,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222968, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "82ee51c8d8f4cdd949cc92c5661ad0bbb813f6e9b89fa0d6f483a93f4b1cc31b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222968, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "070e2c48dddc5c7e06c3ef0b028d5dd6a2870995901c26af2fbdeeb15e044195", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -7146,6 +7480,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -7197,6 +7534,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -7234,7 +7572,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222728, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "7e811f617211f842044486f8fcb33dc383ab33b691344577c6e36af3e56c677b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222728, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "37a38c301d6041eb278db0af21780321209084c4b9730550e0965d59037203c9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -7256,6 +7594,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -7307,6 +7648,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -7344,7 +7686,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222968, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "c34f5227dded240822293c6aefba1c56e113cdeeff902456ec794f98f20061d7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222968, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "bca5fabaf58b8df0097c80cfb3044cc35088e8f61a4a07d6cc394dd3b3704757", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -7366,6 +7708,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -7417,6 +7762,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -7454,7 +7800,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222728, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "b05532c6ff1fe3f5e49c36e77599f837f43f65eea592330f32fa99d2851acdf1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222728, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "aff87782620ab5f6b09fe3d499350e6fa9b362ba80abc45b3964f5d084747daf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -7476,6 +7822,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -7527,6 +7876,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -7564,7 +7914,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 210584, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "d5b304081c4bfebb15054d13a8ab2e6365843b9637a1379abb1249e0b8cb1c8f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 210584, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "9a0079b6a4b67746f4e93b083ed774824e99abeea50b19168ad78009889ac13c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -7586,6 +7936,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -7637,6 +7990,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -7674,7 +8028,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 210584, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "added3d481463811b30f8055b482dcb92e65059ff6a2f6fa27ea520250ed028e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 210584, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f2e2bf90fad2370e5f26b740689acbb61a32da501e67dcc240e48be326d291b1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -7696,6 +8050,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -7747,6 +8104,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -7784,7 +8142,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 184432, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f67c6796361acf2f26c129e6facd21b244310d5ca0b52715c0fcdcc5b8868f19", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 184432, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "92aef2e5fcf61530ce75048088bef709fc9f68e2cd91de8b16992a09d9f82622", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -7806,6 +8164,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -7857,6 +8218,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -7894,7 +8256,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 184192, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "a3ad5a694521b3f9455e33571a9e3ff254210d1b78eb85e57a49180311a58f7d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 184192, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "3b286cf2a0dd2bcd4ffd35485626a92087315ba9543645abb1af9338d9a884e5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -7916,6 +8278,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -7967,6 +8332,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -8004,7 +8370,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 184432, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "e4191ed005d6b44d72e063bf59135bfc59bafc1be7e2c42a041a9c44dae0f9aa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 184432, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "59b5dd42eaa934c7ecff176aea2acec0f5a549b9f43076422b2f229aaca3d9f2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -8026,6 +8392,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -8077,6 +8446,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -8114,7 +8484,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 184192, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "4f0797cbb4f470b1e8cd21c728e178c28f4f7e7fb3225f59da4d82fe72a52867", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 184192, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "9d934a507d9cb176086502c9abc9fc4161ceadadd25b29f828a1dbcc575ac2c6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -8136,6 +8506,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -8187,6 +8560,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -8224,7 +8598,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len, 163232, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f", 512, "ea111437a04eaaeddd626e2e33895a57005501793264f37056529f8ee2c22e23", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len, 163232, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f", 512, "c21434742ae17ee1ec25d77d77e13ec2c07a5ec8db9ba2e81c527b6e8206db0b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -8246,6 +8620,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -8297,6 +8674,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -8334,7 +8712,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 203504, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "709ed8ba216b7e6b3d4a2157700b72da92337cf1c81818043232abee9b00a5c0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 203504, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "4bab230d1bf19c6f78935d46c15d3c911e1db0c34c332d3d4923edf5ef1db49e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -8356,6 +8734,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -8407,6 +8788,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -8444,7 +8826,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 203264, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "7e5547497b434fd1bc8029d34f9d7a544ceb22ec065f985fc307be4f7908b569", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 203264, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "fa6d83a3fc7171727e1278bf90f8a0430c8ed74f52db1115abde9c72c35940ec", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -8466,6 +8848,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -8517,6 +8902,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -8554,7 +8940,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 209640, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "c007c728f3b5c2a3acb58c74dd0f0acac2aa0d3a549aa9a26a35f5a068d4d885", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 209640, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "9362d314db81551adb2a323dd5874e5ce8b88a26dd9573f7e0ff016cf993bf75", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -8576,6 +8962,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -8627,6 +9016,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -8664,7 +9054,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 213736, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "40d6ebda2c2bb8f1f09005982ccb4f64174982b8d137f22ba99820cf91965f8e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 213736, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "4318283ba311353a206ad351e9bc7e1eba882fc0e3e00cf8a9895086a4e4f95c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -8686,6 +9076,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -8737,6 +9130,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -8774,7 +9168,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 217832, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "3dfdf51e11cadcd51fbdd8dad85f3f0c57b05ea883c18ae15c084fea3ae262e3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 217832, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "d4b88be103ca399261c8d0d1c3ec309550f611a8b82a9404ae7c74b432e7eff0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -8796,6 +9190,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -8847,6 +9244,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -8884,7 +9282,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len, 163232, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f", 512, "23a7d3f9dac9a32c49107460e73ba330b8f69bf55b1015d79357eba928228762", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len, 163232, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f", 512, "875fdf51ea1a434552dd37fc1f4bb828c4823c08b1543269df4e5242deffbfbf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -8906,6 +9304,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -8957,6 +9358,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -8994,7 +9396,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 203504, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "a149290b4594cbc1a6ffe8935e02d4baf56b51ef01216bc658eacdf030cbc912", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 203504, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "2b94b8207efe9f30fe8f268410c445502b9bc9d3924529aae7998841d8f41a7f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -9016,6 +9418,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -9067,6 +9472,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -9104,7 +9510,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 203264, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "903d99f582c780854c0272a8c8c438a9fd411e2464f93b19ea0cfe233239fddb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 203264, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "d82c1a61ebe47d4f1abd1a3b159c5bcadb7b4c50d93df6cda1beca9b174b822e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -9126,6 +9532,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -9177,6 +9586,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -9214,7 +9624,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "0803136a8f86c97c34663bbe8410b8a4d4759f4c36bd789f8a1f584c368e28f8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "7457cae55bfa6fd28e69665a865650708963fb9bf209e95898ff9195c80d5308", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -9236,6 +9646,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 128 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -9287,6 +9700,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -9324,7 +9738,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206688, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "1dbd5f0672d965b488e021bdc3b1260d834c0822b3cb738aa59d5431635fd96e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206688, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "2c0c7a0b8f3a9dbec723cf8e1efa27cec4c52acf6e3fd3b06ef98cd551c8c212", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -9346,6 +9760,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 128 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -9397,6 +9814,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -9434,7 +9852,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 185848, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "3fd3868c93ca651676139616c8bc13255b03e75f63c809c560d724bf87200c93", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 185848, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "bcacdf193acd22502f47da3c4e3078dd9ba0c544afe289d6272f12d9f7adf228", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -9456,6 +9874,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -9507,6 +9928,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -9544,7 +9966,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "495cd84b3379c587d0a0f6a908cf861670514cf58a3b1eb21479ee40a9e62091", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "40f68e76c683030fadacf8b886049462fc28fe09710c318337e383777a10db0c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -9566,6 +9988,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 128 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -9617,6 +10042,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -9654,7 +10080,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206688, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "04bf66cdbebf70f3a9d2a589f14c79f28c84a0982c8114eea99613e75fb6c4d2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 206688, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "dc1125453ddee76f883dd0ac968e935822531a4afdfa015d46fea30ef123a08d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -9676,6 +10102,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 128 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -9727,6 +10156,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -9764,7 +10194,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 185848, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "2fee557597fdb35b51a1c972d1230fb53714616c41272806abc7eef6cc72984d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 185848, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "d3863f7876b8f5f2251a2862903b9cd0c4d4d1a8fd108af3af2e275f5960fad0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -9786,6 +10216,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -9837,6 +10270,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -9874,7 +10308,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 121264, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "9ee8b8f337c843073ba17311534e55ef02cfa006abcc5a66739c2b745f45a0cb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 121264, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "1d92d451d88836268b794b1d22666dffbe3d004983882db3fd70c4a9829359cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -9896,6 +10330,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -9947,6 +10384,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -9984,7 +10422,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 120976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "73b7c8d58c1eee631743a6b3f80e64b605af04d15e3f1f2eb08b5ab1f30b0167", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 120976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "02813c7a5a6c032f8ccbc29b630169c201e09ec57cde4b1aa01c41188ba48a74", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -10006,6 +10444,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -10057,6 +10498,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -10094,7 +10536,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 121264, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "860734c4bc2b0d67b0e0627582b6791f932a1382df9a251e95bbf3e04005e989", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 121264, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "709f152ad27a81092d44c2386da9b8cd8d0009febaeb1ce3600710f01dd88bdd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -10116,6 +10558,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -10167,6 +10612,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -10204,7 +10650,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 120976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "2e3a448a8dec61e7eeb21ad26e5a7c4d1ea6108417cc86bc8a134ba80f4b8b58", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 120976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "bcf0f7235168ab03f5f93613f4377dc224f8fd87db6efecd3626def16c864f6c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -10226,6 +10672,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -10277,6 +10726,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -10314,7 +10764,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 229840, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "0e44b3e85f7c97a2a4468f1669dbe64f58c9a6f4d9790f1f4ef1887d93a9f1e5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 229840, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "2c9bc9f4b8a9ff1645dddf4283a836899d68153c8697f1ba9fe448dacee05726", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -10336,6 +10786,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -10387,6 +10840,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -10424,7 +10878,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 229600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f", 256, "93705ec342a13dbdb8a0b34bbf11b3664c885b1913a178bc6860ac13a0088d85", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 229600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f", 256, "2dc185cc8ba6e34964afe2254711dc1370c93f273c493e8219ee49dfec2472cb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -10446,6 +10900,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -10497,6 +10954,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -10534,7 +10992,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 229840, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "51441838147dac45da6c5e15e9fc74e3ad16cbcfab58e20b64087265e5d688a1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 229840, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "bb84dbfb30122f4995daf16debf77e361275ab673e7408e6f61704cd4837c477", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -10556,6 +11014,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -10607,6 +11068,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -10644,7 +11106,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 229600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f", 256, "44769852ba406f70f365ccc73577c16cc27ad571ff718f60e7015ef591225179", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 229600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f", 256, "10c6a725ea9c408621a8d2952209bd1184f1d5e9cec30fb52f0d87a2f0cb8627", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -10666,6 +11128,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -10717,6 +11182,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -10754,7 +11220,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 215544, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "89d93dc9c518bbd9e89836a6f7d8de4ed840de8db5e3a8ff074d58e5742f8247", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 215544, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "b77b206a4f3bf6ded291976f7c31e0bd65ebf9054fd27a8403ad266e587b5f89", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -10776,6 +11242,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -10827,6 +11296,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -10864,7 +11334,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 215544, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "792ef43d508959d8793d0e5997722710b9a4398e4b9c20865dfbf8b51cc6110c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 215544, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "6a36658210ca3ce32903749d31ebbd50f7dea450afe4c76f4ddb978dba876b59", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -10886,6 +11356,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -10937,6 +11410,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -10974,7 +11448,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin_len, 220632, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f", 384, "8cffc046f4e02e331cb17e206f0e954e144f8e7a0cd806ded8910f1ca572ac2c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin_len, 220632, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f", 384, "22ee142006a7836d7af4d834793fe71d774bb48508ef258b069e36f97951b5d0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -10996,6 +11470,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -11047,6 +11524,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -11084,7 +11562,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin_len, 220632, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f", 384, "e63c04918657e51e7594ab40abdbf48b0b7dc0552a881ddf27ec8104b975172b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin_len, 220632, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f", 384, "9639ce4803fbf4ba7c286a7db6be7921b001c4b3dc1c06d845b8e8a76e8f6118", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -11106,6 +11584,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -11157,6 +11638,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -11194,7 +11676,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 137648, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "7af1d4257c766f602b9e15085e7ed506d3a9051a2c42196285316f19098bf14e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 137648, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "f080f7247441b9901c7bfb95207502c753f2880aafb831151735e0da4962adee", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -11216,6 +11698,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -11267,6 +11752,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -11304,7 +11790,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 137360, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "c2de9435ae086c8173b0d398a94bb1429455ff6aedef0aeb92c8520a04e697df", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 137360, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "2497d0d3793191281bdfbf647acd669c1dd3bdf573fa354d347564eedf412b85", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -11326,6 +11812,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -11377,6 +11866,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -11414,7 +11904,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 137648, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "09d857159c1e53a3b3bcc13218fd239080d65ea0ecd02582722cc8ff148486d1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 137648, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "580ca9d9d00b141203ac0f8a56030780fbdcecfebb09dc11b787d6c433b3b5a0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -11436,6 +11926,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -11487,6 +11980,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -11524,7 +12018,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 137360, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "92189fd74803a1fbb638bcdd55196efb53fb566bde053ad6e9c7a86d2486837a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 137360, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "30c7b544fe9e80ab5386016a8e1ff86a373a42e23a292a95475bdc33c0e5a4a1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -11546,6 +12040,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -11597,6 +12094,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -11634,7 +12132,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 217520, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "5cb3dab2ef7e2f175bf347456446a0a05e57a047774700af6cb222ed1d983a29", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 217520, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "9ae15612f37b46e71cff5a5910841b1cb04bbad3f6e4754b6ac9a832390708b8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -11656,6 +12154,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -11707,6 +12208,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -11744,7 +12246,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 217280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f", 256, "0ff6b702a953d89e66a9d91dcdfdbb27cea7b85b9a7950957d55cbfa6c23e69e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 217280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f", 256, "267a04b39f69b08c3e33008ac4a3afe7096f4ec1f98b0f50793cbe6118c997ab", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -11766,6 +12268,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -11817,6 +12322,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -11854,7 +12360,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 217520, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "a82d62b874f9f52a2b6def073658bd5e15e5a4ba24f98e7fb743c8be422dc042", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 217520, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "d6d32b642894499374e868600bcd9d7f8218280784d45733f3dc6567d01124c6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -11876,6 +12382,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -11927,6 +12436,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -11964,7 +12474,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 217280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f", 256, "435055cc40fb550681591584c8e3ec0b6934116a5e05f46f9539a36ac54f2266", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 217280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f", 256, "3dce960a8f28e8ea852f82e45aa4edfa65f8a7f1b838420f0542d0c0e15931b8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -11986,6 +12496,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -12037,6 +12550,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -12074,7 +12588,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 172208, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "68946c1276e51ee06ba1a06c5d42cf70af634d2c0a63d0f971b3c1550bdda90e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 172208, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "3774a748843badfb9318cf7298457be136a3effd36d7b4dc4ee5986875f355d0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -12096,6 +12610,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -12147,6 +12664,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -12184,7 +12702,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 171920, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "ca3c3adbc3e656839624ce2f124d0efff02c579ec08a372f54e0075d43305668", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 171920, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "6af5b157aa7b1ca96e40e560b73b25f3b09e68b68bc95c7f62a7241e74d09201", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -12206,6 +12724,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -12257,6 +12778,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -12294,7 +12816,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 184856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "2e907155fa74dd305d0e06f50af31a9af8a067a6757c27023f900f848e15132f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 184856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "6a96bbd9e790f9d7def6603ed7093fee9c7ffa93ca13859fe27165fefe43ce51", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -12316,6 +12838,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -12367,6 +12892,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -12404,7 +12930,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 172208, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "e9f38f13002f7c2af12fc5e3f1674de1357b7135a2fb7d89a08d992c55eb746c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 172208, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "a4b91b8d36332a3d2f95ee71e568af60cf8a15a73943a08d06ce2b5da9a10576", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -12426,6 +12952,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -12477,6 +13006,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -12514,7 +13044,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 171920, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "4cc7e0be049ed6105f92224f131e5796dd7f7bcdf8c395bb82ca884c65868cf2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 171920, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "21ce95a00368c18a4405e0f1e6c338e04324ec22757698a787e99e4a87811881", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -12536,6 +13066,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -12587,6 +13120,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -12624,7 +13158,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 184856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "949fa126b88d4ab80de87cefbf3691ab3cde5ba0069454d4f56c32f20e0a9a6b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 184856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "f422820c9b19c2d224fdebc84fb16a5f076065640b9eac4aba3992070e11a034", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -12646,6 +13180,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -12697,6 +13234,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -12734,7 +13272,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len, 77600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f", 384, "0228b8628bba8677e48228382ce73af40d5f9e476028419eb610057dd58203c5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len, 77600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f", 384, "d3e09597919a7dcc5a8e86825578a45450b31d53fcd5adbb4542b2a3c2a82e59", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -12756,6 +13294,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -12807,6 +13348,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -12844,7 +13386,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 148240, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "163ae0618d252e6d38f9e32afb67507951ef3d2844bcf338011db07a1ee449a5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 148240, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "8f273834ba9de1c84b5708cc2a3b841c7354f3f5c0969df9f851981283949f96", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -12866,6 +13408,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -12917,6 +13462,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -12954,7 +13500,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 147952, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "fe4087a0328effd933219bfed6cd6e0df79f36205753ccd11be02a61a1f3d0bc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 147952, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "fbf6db6b83ddc1a78e46d9ea0e23b84dd38c0aaf4c7e1f2dab0aeaf97e9f709c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -12976,6 +13522,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -13027,6 +13576,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -13064,7 +13614,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len, 77600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f", 384, "69bc1e7c266f2084daff322af2f477e0c035bfaaee4e6f09291c53f650bfd8cb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len, 77600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f", 384, "519b8bc18d56715d82e3dda4d60fbb005052266b98d04178ec8912adaafa2107", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -13086,6 +13636,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -13137,6 +13690,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -13174,7 +13728,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 148240, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "165db8a353f546ddbad90c989ef274e2cc9e2de94163a256952b6e05332255d8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 148240, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f", 512, "99d0ffd11a984bcade478ab1b04c6eb09a597de15d8d15d0bc3e10d06098ac58", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -13196,6 +13750,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -13247,6 +13804,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -13284,7 +13842,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 147952, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "d3baffa228944734d49130c835d7b25f3b5f7f1b2e6c240ad2d6050b969a2c48", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 147952, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f", 384, "f9876605e77760b82a6e1672b4d27cc8fcdd866a48e0e883e5ec2d504ddc37bf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -13306,6 +13864,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -13357,6 +13918,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -13394,7 +13956,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 215504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "f3c2524182bc50078e5357eb71731707bf1dd30efe8b69a6c7cd4b95657092f4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 215504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "2f8feb889334608650b16b2767df326b02ee1aa8f06e12354d5c920b3f06b998", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -13416,6 +13978,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -13467,6 +14032,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -13504,7 +14070,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 215264, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f", 256, "2da4c64a5fbea5871ee39877165fc6a6cf22de6d42c58d8984cd19b8c6b75247", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 215264, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f", 256, "c53a5ecc7e1a53783cdb5b6b0f980b31b4bb551c3e6126c569cf0938300d72e9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -13526,6 +14092,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -13577,6 +14146,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -13614,7 +14184,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 215504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "645a0c5eda479b4db08221e6b0c0567ac8b457f05a402fd27da858c344236519", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin_len, 215504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f", 256, "64d988efdc5a21e5aa409014d9d368e8873dc851b02a50cdcbfba32e831c8152", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -13636,6 +14206,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -13687,6 +14260,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -13724,7 +14298,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 215264, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f", 256, "0d9fd887a12f988fae46ffd789cbb6094d2a1121e26f99ebb29365ee341caf5c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin_len, 215264, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f", 256, "9fc7b945ad3d52ff6f026928e80d03e1773bfa31a0e5dfeaa0f26791afb90725", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -13746,6 +14320,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -13797,6 +14374,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -13834,7 +14412,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f", 256, "6d632e3c91737e5e0d7a1f5b090d768be5186acf25403318067194c862df4056", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f", 256, "4b9e65a29bb4a9dd96343279edf60708111381e1ae269d1e82159921e6199f03", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -13856,6 +14434,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -13907,6 +14488,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -13944,7 +14526,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f", 256, "41d540b9837a53ce267412d51d87690883c589412c68952aa31b011983de3c15", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f", 256, "c277a0a577ecda2fede1ebe700699e66959b8bf34550cb346fb780b9f3f234d2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -13966,6 +14548,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -14017,6 +14602,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -14054,7 +14640,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f", 256, "8028f7044315d9db113e9e25e12ab8830fbb4cf6bfb6230741fad9510aca16f7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f", 256, "0f44efa1339f57480bfba5e9e01c95bc5f5bab340fbc643381bfbddcb11f6fb0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -14076,6 +14662,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -14127,6 +14716,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -14164,7 +14754,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f", 256, "e697e45a7aded5d95052ef7121aa225c1fd773499cc455ab4966a5de6085b217", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f", 256, "e33e84156301e1232b0b9ab850e44cc87201b98d906ce0f5bfe1faf86c16550c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -14186,6 +14776,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -14237,6 +14830,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -14274,7 +14868,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 73296, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "20aab5a9d324b7f5c3c1f61786a24ef2e0ccbd611aff4793edb5ef0528af2afc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 73296, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "23e6879f5682da3e1f59ae8b00746bd7ec41417245d0b951b5a8a77c09524174", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -14296,6 +14890,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -14347,6 +14944,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -14384,7 +14982,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 73056, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "5c4cd11d88e82457667572a82bdca59062d9bc81e2c051706a8927bf55a3cc50", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 73056, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "a5eb733a3aab4d139a26faaab0eb2e845695f9c005e563b5623a674096d0f680", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -14406,6 +15004,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -14457,6 +15058,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -14494,7 +15096,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 73296, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "6d7efe60853d043cdf209a0de77466425dbe4ba76f1efec78df4698f30ee7f2e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 73296, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "19daac3f0534632794d5e92e45d90b2ce189a65b6696cb427f51ab8ecab2f8e9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -14516,6 +15118,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -14567,6 +15172,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -14604,7 +15210,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 73056, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "a7c37f21e95f5cbeee57c59eba577bc941bb82329d4ca5e3de91b8c9251e78af", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 73056, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "219f31e6f3aa0f75f8f51de901b41c4957a79c53bf2ba318d426fd6075887cc5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -14626,6 +15232,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -14677,6 +15286,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -14714,7 +15324,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 85456, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "96e55f596dab7810031cd52480bc7f0b47a1fffcff99238d73bddf66419bf3d7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 85456, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "8110128fefc020ff91e0a4544c9521aad059b002f2d6ddf3e5466c9f8b02e26a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -14736,6 +15346,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -14787,6 +15400,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -14824,7 +15438,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 85456, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "ca2221abfad0b20ec3249df74698c4b9aaec2cef30f4dcf9a95b872c2513f00c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 85456, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f478b64fb7a249d5bc0172d722ed4e09bd8a7fa95568b15b1f0defec3b6b01b1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -14846,6 +15460,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -14897,6 +15514,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -14934,7 +15552,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "c4fa06c68d1c0d08f359847ed23c3279c83cb8d07f0b0aaedfbe677fdcf1bce7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "f4462ea202b088122cabe88cf0540877e047828572d6859eb0ffb5f89ec3e9fa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -14956,6 +15574,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -15007,6 +15628,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -15044,7 +15666,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "957b078f4bcb0789a8e2b16aa5464b8f70b00b3daf361174f942ea32f851e0f6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "962284c3315c8612cc29fec7dcc0866b9c643b4b7905c2df92847f1a3c467c4a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -15066,6 +15688,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -15117,6 +15742,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -15154,7 +15780,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 85456, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "7f4ff0e696db70beef97ecf7a1ba77a2370d13dc973cb6e683435abdaf78d8ef", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 85456, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "47b2d58c75b1ae744a8f706a2f82d35762079cdd920fa68d9f79bd06add94bba", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -15176,6 +15802,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -15227,6 +15856,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -15264,7 +15894,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 85456, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "e3ae72bc5d77684de83a4d7de04f0052320b71db6b651da0f389e19de0052e88", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 85456, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "6094bfa636eec622c6027a7e47c5b60f3f5242ce0fb432dff5007868e5d44e59", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -15286,6 +15916,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -15337,6 +15970,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -15374,7 +16008,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "71497e02845979048b72e03619e26db05f47e942f486d31844c126e40d2ce9fe", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "a35a768b6f7edadd07a5c13e455ebb15ad5cd4f80558244fcde6de08e52b2474", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -15396,6 +16030,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -15447,6 +16084,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -15484,7 +16122,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "14de1f999958a579ef2a0c09a75582fca9af0e5a0d94682a2ad680fde28753c4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "e4c41de71568ba09493c833635821599c97533af2784675aab5d51f8d3632d8a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -15506,6 +16144,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -15557,6 +16198,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -15594,7 +16236,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 97872, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "4ed3f82f32d486791cdfad96c790c3f7e386962d9df33f4e29e4be43316e90ed", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 97872, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "d612f542487d0a5a44e3adb43b91443235544c4e4ef08c858a44d61cbd9209cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -15616,6 +16258,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -15667,6 +16312,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -15704,7 +16350,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 97632, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "c0e3bb9e807761267266e52d7b29618b49c1222abdced8a2167f6dca0b94f45d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 97632, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "d412cb418d5b77674a0506490819f225e336c74311f4f31bcf8ffb416c56e1d0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -15726,6 +16372,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -15777,6 +16426,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -15814,7 +16464,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 97872, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "f34cb831ebbd0e771c975d5d12463990a2aa3f43e96a99b3c6a341d4584df7a2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 97872, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "f34ed1401ace3ea024983cf2d236319794f7c5ffb0df98f163fe723eac68d3ff", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -15836,6 +16486,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -15887,6 +16540,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -15924,7 +16578,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 97632, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "78fe4b003a2c08564321b19a46c241ecd4575906bb0644bf8fd2f54fba59b0cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 97632, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "7683e58ef9bd4bbf989c604df802673ca5dfad80c80f50bca05acfa8fed1cae5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -15946,6 +16600,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -15997,6 +16654,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -16034,7 +16692,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 114128, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "82f0e36d35614a603908baa85a3b59a0509c7560e7152ff553ea5bddc74aaf4c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 114128, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "ab3762e4eecb847752142ea69e200fbae45fd8ffee64bf2a64c6c412a5e40f4e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -16056,6 +16714,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -16107,6 +16768,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -16144,7 +16806,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 114128, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "16282aaeb68078492025a3585fcd24fb72e9ce5821604dfe201ababa49153659", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 114128, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "6c1b89b2724b6947029074137da3fb66d3a894759308802dda011b7ba620da54", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -16166,6 +16828,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -16217,6 +16882,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -16254,7 +16920,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "e10c0224b9ae40f181b03646354274acca5e338bff1d4c874036ffaff616c628", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "acecc3720906a3c99b6883656c2710a43ab67c9d2c7f4f789c1059fc1e95ebf3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -16276,6 +16942,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -16327,6 +16996,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -16364,7 +17034,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "594d233181fcf06bffd60377976c0fb216ce18ccf23e807c7fd2bc1246474ae5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "b7e5d825bf16d396b1e26641f71cfb3626f303d2de193a24f2705816f1c4199d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -16386,6 +17056,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -16437,6 +17110,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -16474,7 +17148,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 114128, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "7fe5e317140b4abf908d57c45dbdc7bf64062b7413404ee70f3f45545dc8013d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 114128, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "03008dd9e3361e476a10ba22f53e745da5c0f33acf905c36df2011cb0e27d44d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -16496,6 +17170,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -16547,6 +17224,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -16584,7 +17262,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 114128, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "108e5fa414e2edf1d018c4223fac21f4dee2293a30fb9341d0eae3b262e6aa90", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 114128, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "ad0e3e79fd6ede2187e363740f065cefccfbd83c100b8c984b9e4d5bade6db06", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -16606,6 +17284,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -16657,6 +17338,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -16694,7 +17376,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "dab9488d409915f0a0e9f41f5ce1f921cd6213d09fd11405c91799a0d6be246f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "b51065ec78b6da570e8f7342f4c8090de16dd94e923d90911ca4d56ff8667fa1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -16716,6 +17398,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -16767,6 +17452,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -16804,7 +17490,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "95a36353da567e414f8d58b16775c22b6581154db21b43532f61cca622d7768e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "06a68fca6141ab75f3a832b445b6e73a2621d2cdc6c1584e7f3f05406ddf312a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -16826,6 +17512,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -16877,6 +17566,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -16914,7 +17604,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 148048, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "276e6544b5a5e789b17fe270f0412e0e47d484a400c294bd1c593a46d20031d3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 148048, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "8d5db6d22e534c02a9826d14e2644fb8de8b7dc2d257a8903217376e4696311f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -16936,6 +17626,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -16987,6 +17680,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -17024,7 +17718,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 147808, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "c7d9ba96bdf692e01310aa34cf61c9beba3e505d59772add5988e0ffbd46738d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 147808, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "0ecc0f2af9c37b4dd53d4e2088cde06a712866bb190a83003640667c200eeec1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -17046,6 +17740,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -17097,6 +17794,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -17134,7 +17832,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 148048, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "c32adc6ad64bf73948a666fe560bb2eaa210eaa898132428a7be923fd165d1c5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 148048, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "886c05d53173b4bc1ec4711f51ce3f4272f37c495f246606011622ecfda732c6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -17156,6 +17854,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -17207,6 +17908,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -17244,7 +17946,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 147808, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "dda1f4dd9aa0db9c015290540d378be934f3108c983f3c90474bd045977530a2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 147808, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "db1105607066af2acc7bc6f1a44236b2f8273ef0cfb576f5f7683e81a5770318", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -17266,6 +17968,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -17317,6 +18022,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -17354,7 +18060,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 172496, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "f3a4db69b502a9ff57b8d007cd0ccd2df738d8bdfea027b5557cffd583a23d52", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 172496, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "bea95a5ca490eb16c159b1212ded860e451a518f36d2f6c0142351bd7bd460b7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -17376,6 +18082,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -17427,6 +18136,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -17464,7 +18174,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 172496, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f3133904382bf6861c8fd49caf898b5d3bf4ebced22dd6467e0571c5c9081b15", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 172496, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "4bd3fa3be9ccd6d292e11db708a84567d090a36e712f0f86b8ebddf7d9120878", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -17486,6 +18196,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -17537,6 +18250,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -17574,7 +18288,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "2a7555f2a67bd824fac43461ad3c31c9466f45fbf0cd89b8321d331893934847", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "e998ed66b7319f512c001370d7d4e8aee1637e5b4d4770ce11016ca8ea6034f3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -17596,6 +18310,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -17647,6 +18364,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -17684,7 +18402,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "cb386773aa2c3faa73a5a14fa95da6c73ad13feccbf7d472ce0173d20eba1db0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "37b6fefd7c5520d9e42717b28189fae249a80572bd7786c4a798b560068054e7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -17706,6 +18424,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -17757,6 +18478,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -17794,7 +18516,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 172496, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "222dec48978238606f92b0484eb0e183675f4dea271ee85e8624734d03958a7d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 172496, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "2d137e1b53eae5f55fbb6f85e43228ae8121999e27221b7a30c9c6f0a88c537d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -17816,6 +18538,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -17867,6 +18592,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -17904,7 +18630,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 172496, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "82716bffa0f69e7b6e7b6d89a5789c2a0d8a75334371af664b48eb06373d1659", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 172496, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "6cb281661d9792f6f1a789d5611dc8ddbccc94513cb78b2de2484e5f4040878e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -17926,6 +18652,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -17977,6 +18706,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -18014,7 +18744,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "8317e5b5951fbab83d1dceb7440d9d64150f745abd4d6baa5170c9f14d6ecb48", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "10a3b555a8cdda63a4e4a5c4f53cc210e5f662e2acd2004b96df6ed74873df62", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -18036,6 +18766,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -18087,6 +18820,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -18124,7 +18858,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "cb349d0e2071c1ed6275d76a557ba8e745852a3b9f86be9b6c6d372a7567992b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "1fabbe9251d7e97636322670bb9d6892c034c7a7ef29c068aa170084c65140a7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -18146,6 +18880,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -18197,6 +18934,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -18234,7 +18972,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 61008, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "f471163d75dabe7d07acd2d07ad9a0a502b1d37caaae0200d723512becf9c1cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 61008, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "8d2e75d8cd37b0b41727b75722218e5004c29bb77141293a547f333ded7107af", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -18256,6 +18994,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -18307,6 +19048,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -18344,7 +19086,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 60768, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "330af66a59b02a7d6c80d13933860f8564479075408e20d7366d115452b3568c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 60768, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "e796d504151b5150e4bca77b198fe914d1368bbe6780b4cae95dcaf23d0c63c7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -18366,6 +19108,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -18417,6 +19162,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -18454,7 +19200,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 61008, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "1890c9310a40ee337a404455acb8665ea7907195fb7a6eacd76ae62e104b5f53", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 61008, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "25fb2f38d1520631d1ca1c284092f6d34705a841c7e6955f17708503f8df50b1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -18476,6 +19222,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -18527,6 +19276,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -18564,7 +19314,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 60768, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "b8bfc30fd671f90b206592495445aab722cfdcf9ca50fbdb827368dc568ae518", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 60768, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "518e5d05b921772c198926e1383e98c2ab8dafa4b0173b073526573312ea3492", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -18586,6 +19336,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -18637,6 +19390,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -18674,7 +19428,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 71120, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "192a49bc69857bf20f9bf966c13605cb2babc404859c9b489580a5d569a37555", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 71120, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "0f830486c564762184309c0d36a8e36cf6c12c8f7384810696dc01cbdf40ffbd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -18696,6 +19450,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -18747,6 +19504,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -18784,7 +19542,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 71120, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "353fc23a5f9a238892248e56206ebdee8b8cbc204ee5fd40c1fa2ec4c99e24b8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 71120, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f9adf6b437b59d8a3347f707e60adbbbc2e3475aef70980f09d20d09051f1cbd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -18806,6 +19564,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -18857,6 +19618,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -18894,7 +19656,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "2d2bc053782651f9badca5e4ecb595fdf3ddfe7b23dc1f7db7d0048743a45924", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "8b10910ec7ceba451d2d567f913d71110863abfc3b5d53ec7610ee25e6334bf9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -18916,6 +19678,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -18967,6 +19732,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -19004,7 +19770,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "d0c20c11ac8368169a67070dead47052cc7eadf991d991adfe65414cf7360bc1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "02902c53f733cf9f6c9b9e1f988afe693c3dfd5327d32e481f192f2bde2f5732", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -19026,6 +19792,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -19077,6 +19846,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -19114,7 +19884,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 71120, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "b8f8f31372f7be6c7c7c2e49fd56ee1f24ab5a12ace2b76f876438328449486d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 71120, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "7a47b13c2597c668aee7cebb2a10d2ec5c49f056c770f026e061b9b431a2cf1c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -19136,6 +19906,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -19187,6 +19960,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -19224,7 +19998,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 71120, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "2993b344ff566a137c5461d0ba0c1f7d57aae041c47be01290f24bb16e234fdc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 71120, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "b0cc1927c836c87c79320d23d0b0baaa3c50b9592901c7247efd1a2995944bbc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -19246,6 +20020,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -19297,6 +20074,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -19334,7 +20112,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "6c11434a040a6bc91cb9fac17dff32205ac1a3a18ad06d6fd2e28356ef759036", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "fb4e3364d9067b8b53ad119c336b14cabd6e457318868e6b1698dd647b72268d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -19356,6 +20134,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -19407,6 +20188,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -19444,7 +20226,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "14ea1e97ec537058c447beacad93ac0d9bbf65e1d205322766361a74a9312869", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "8975f425328437a0080ec032d28652100bae2cda3b4490ed6727f593a3d4ebcd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -19466,6 +20248,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -19517,6 +20302,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -19554,7 +20340,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199248, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "ed4e7e1a1b116d287fa9207ff252234c91c81f33324cc58dc8c3f3612c948ec4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199248, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "f65c15548e7898d023d2de9ad160e0b9a84da58037a22dafa4cb1cecd04017eb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -19576,6 +20362,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -19627,6 +20416,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -19664,7 +20454,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199008, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "77300db9a68a9aec3613cfd3d4961aa7dda60d7563e2b8a262fba50cf8aa5e0c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199008, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "384bc216ac79bb982c9817dc262d24363c37f6fb5383a08683c4efd61a137d9d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -19686,6 +20476,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -19737,6 +20530,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -19774,7 +20568,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199248, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "ffcc063d92df8828b993b3392859b0326302360b01b7ba33ec2be2c4b8d26e38", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199248, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "e8ca0def94839d7006d11c0e23250364d0f9bb0ba762fcb96c94a46972a3ab2f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -19796,6 +20590,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -19847,6 +20644,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -19884,7 +20682,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199008, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "2e4ecd933d833b9d8e6d61fba2c83ff27f922df33d90cf0a094775c8731607e4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199008, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "0135f87b025375ec0de6db6a23afc80f8a153e9cc8158b02564cf5727c1218b3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -19906,6 +20704,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -19957,6 +20758,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -19994,7 +20796,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 223824, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "a4cc4366fe59ca1232e3ffb04b7dc207c4bdcf71b6302c73e893f72279b5c614", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 223824, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "a1317c4898243ea77eb01f250acb3e0a877a1a506f89dada6e5f3ad373c55179", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -20016,6 +20818,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -20067,6 +20872,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -20104,7 +20910,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 223584, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "2f8e20fc285c12a0458f88de9ae470958171d2fc1de390b12d9a5ad6adc3608c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 223584, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "d2dc4226d392905808a5a20f8b90015448c8effe6e41baa13a0ba0b48210f8e4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -20126,6 +20932,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -20177,6 +20986,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -20214,7 +21024,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 223824, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "34a3922bcdf8a0cf7811e2454d17e5e481289e74cd812d9e42ad2c16a19cb2a2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 223824, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "2eb2b2175b8fe29ff14020560dfa9b8dff64650098eea4ddc913da86154e260b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -20236,6 +21046,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -20287,6 +21100,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -20324,7 +21138,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 223584, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "77d61a3b2e987481ab08800db6d21257e01323930493edc5a93496a7a720c879", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 223584, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "02bfd93040a63db63e6be75c2b6e14debbca7b03642c56f06f00a806f9dc8476", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -20346,6 +21160,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -20397,6 +21214,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -20434,7 +21252,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222736, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "2b6f1d74d90c2a18fd203873cfb0af03b0a6a3bbde0f739f021cdd1efdd4c5f7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222736, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "699837aafa51ae11803fdece15c5e6d7210f21c331cb274ee87244a88e2db28d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -20456,6 +21274,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -20507,6 +21328,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -20544,7 +21366,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222496, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "577d1792d953e888813c2bfd4f8f3f39678a915835ef5598d02bf281dd57a336", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222496, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "fb06ae65cce73e41c4b7fe3f415acbf13710191249da0cbf478478d64e194b68", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -20566,6 +21388,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -20617,6 +21442,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -20654,7 +21480,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222736, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "19e4833d97258b5623a723003ccc67409990a521f8c426b865cfc9289750932f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222736, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "414cd1dccd06d196c446b348de7c0dfb052eef99dd73619f22489b7d4dcf80bc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -20676,6 +21502,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -20727,6 +21556,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -20764,7 +21594,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222496, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "d674836603cbfe454fa4237b969e42e93633aae2e767f62280cc25dc505f1cbf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222496, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "7d69b625d2c2ad15d5ac928deb6704ca2706c230824928fe5f994b67f609132a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -20786,6 +21616,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -20837,6 +21670,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -20874,7 +21708,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 186960, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "15e4de3011dcef79a8bfbaf0604dc4792df9720eeef74161154f4471dad18309", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 186960, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "c4d99e7bd5a3e7f7ee66915a9898a28c9e5b6ddd5887b3078f5ba8eb0bf911af", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -20896,6 +21730,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -20947,6 +21784,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -20984,7 +21822,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 186720, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "6bb46252d0a738e1171369f39bd96d0493191fa48a248ad00fb9231a20106bda", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 186720, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "c7c445c453df9363e3eca88414aa7af2b74409001ded953b99447d4f2d49d217", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -21006,6 +21844,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -21057,6 +21898,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -21094,7 +21936,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 186960, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "c66191e5e5fb15837ab9a38d8663723c186b84c0e6f8805ab4f9f44f9aadcf0d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 186960, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "4d1919a8c7877d5da4c731472be259ea48b3b6750a5c65f5bf379e9b51f43e79", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -21116,6 +21958,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -21167,6 +22012,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -21204,7 +22050,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 186720, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "c5650b0406947e00d4f445e86b253cd04a5bb3e59d1397c51b353843a7742f63", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 186720, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "60fc99c0936fc790b84070e8d521928f14c4dff48cf039c033915ac6a8b30e00", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -21226,6 +22072,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -21277,6 +22126,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -21314,7 +22164,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222672, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "fd454f7bbb5c5a64003e719532c04482f9be185402404f0d5061ab0caf746cd3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222672, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "44ceaa574d3ca14b509f703a6a868914a63cbfdb92d1052d8764929335a35eba", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -21336,6 +22186,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -21387,6 +22240,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -21424,7 +22278,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222432, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "9b14764f69ba26d3131829a62a0bbb742f9c7fc0a960dab57bdceb187c7d211e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222432, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "1d77797d39ce0765e22a9cf1cb5dd24c488f809948cd57f3f296b4bf6d7b191f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -21446,6 +22300,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -21497,6 +22354,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -21534,7 +22392,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222672, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "290a695a36aa8344c472f84d20aef2c34d737b8abf05ddb7067a9fc0ca930293", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222672, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "7522ff7575e025387c7c503f1f96c46d2645cc40b35bfa3deb79079582722337", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -21556,6 +22414,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -21607,6 +22468,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -21644,7 +22506,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222432, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "7741ee5f0ce585b153997b62f0f2506010fe51b815b6da8156d081a8151ab575", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222432, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "40c353aca457bb2287792234ac71413b35142cafaf2603c946b3f07c2c9265bc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -21666,6 +22528,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -21717,6 +22582,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -21754,7 +22620,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 203592, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "a6265f811277cf7e711f169e1da2eed48bef4120c7827a959e53d04fa049863f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 203592, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "54350618f0faae5c3d22b9e446d179caf24280d49d9e4717befd4af63ad4275d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -21776,6 +22642,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -21827,6 +22696,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -21864,7 +22734,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 203592, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "442bd4b754327c7a1f83a7a08bf941f5bb27dd60072faa233d30d6fc77e6ca97", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 203592, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "e04625e0c34e89a31bd8a802be005b73bbfd2360887e1227ceb81c0285d20a24", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -21886,6 +22756,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -21937,6 +22810,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -21974,7 +22848,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 227928, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "c3d4df8af897767383e196f049f1cbab688869975aa1cd9b19ca3e6844dbd96f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 227928, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "478234f94a0daf5725bafd5fb8fecb3c8f5a0fa34a1a6b7d2bbcbf208228ae45", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -21996,6 +22870,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -22047,6 +22924,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -22084,7 +22962,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 227928, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "a77253d4af0491a8fbf3d3145d1f53fdd7907e504f7f67948718ad52ab65c64b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 227928, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 384, "9caad0b8bafdd1e46796157ee49136f30517c84bc20f4c1bc43de61b1bbc99d4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -22106,6 +22984,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -22157,6 +23038,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -22194,7 +23076,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199408, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "57094c6c7978ef63a4f8f3c1dab5ab1a3f07c237f534059a9ecfbd5bbb021856", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199408, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "579711409a3a3ce06432292e2b778cd9fd4011bc6a657d5b4d8107769e19a82e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -22216,6 +23098,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -22267,6 +23152,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -22304,7 +23190,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199168, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "ec5875dd09de3e3dd32f32d8aefef4b4385f847a1769ddfeda3b4c97a0e6696f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199168, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "a685f96d682362041b262e8aa22eeeba486d4f716a991ce3bd8064d2e267acae", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -22326,6 +23212,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -22377,6 +23266,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -22414,7 +23304,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 225112, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "22121c621f005c4766040eae5dd5643e8b09e770a756f2626ade1f9486ff82da", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 225112, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "6066e188cbf21fde0f4e3e121357b5ca729b687c80b67169a6c713ef52cc166c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -22436,6 +23326,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -22487,6 +23380,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -22524,7 +23418,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 224872, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "2bc2308228ab5f6365027f0545c3e15066fa3771970b60fb2e1fc7a3775c5ea8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 224872, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "acf8de8072dceee75ad043c7536aa12b45734837566432e1036e1bed0eb14f2a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -22546,6 +23440,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -22597,6 +23494,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -22634,7 +23532,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199408, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "cc402d696239c853f29b251292aba3116b15cd410a780cffe08ff8d8933ce9ea", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199408, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "958f3f3739df3b1034fbf33b2ccfcc6621418ac37142eb7c2f2037211f4c23a4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -22656,6 +23554,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -22707,6 +23608,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -22744,7 +23646,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199168, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "b4b7c7f326d4c25f577f158e29cdbef27d4570c394279288893dc4e438289ebc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 199168, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "31412a05bd4543da02e2ebccfa778654c4edfc65fca452376814efdb6f86d20a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -22766,6 +23668,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -22817,6 +23722,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -22854,7 +23760,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 225112, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "9be6f96f443f094f55f2627cc69fd33caf6be5c29583e9161618244e8879e838", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 225112, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "6752830f0a41bcc3905eb567a30a3741d8ae6833eaf17677472eb502d19ba514", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -22876,6 +23782,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -22927,6 +23836,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -22964,7 +23874,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 224872, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "93c83ea56921432815eb08ae65be6552b10649ed48ce101f1521062e100dcdcd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 224872, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "d03c306279a89eac2ed5537db56b6e5e8acbe72d708ad6859e42fe03f38f2b10", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -22986,6 +23896,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -23037,6 +23950,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -23074,7 +23988,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin_len, 197272, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f", 384, "69880f0c911f857f11b7ae46295f335524c732f5d0284b3033b6e7556035f0ac", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin_len, 197272, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f", 384, "00567f222ab71fa2c15d8a43308ee7b892e3be84dadf2ae60b78bedbe98e85d8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -23096,6 +24010,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 1 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -23147,6 +24064,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 1 , /* mUsePerTokenSfA */ 0 @@ -23184,7 +24102,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 225008, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "659189d6f0db1de4e1b114b5c5ce7a774551e3e76ee89976a764bd62260addae", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 225008, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f9bcd9d11377483ae915075decfacd936e41c68fd1dcbd7dd7a6f999b8f5858f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -23206,6 +24124,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -23257,6 +24178,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -23294,7 +24216,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 224768, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "50fdd4c86c5984c52a5c17c5891ae4c2b5ff1ceb1862a6bc4b920242002f883c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 224768, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f01afffea85ec098133c806cf7117218ea3eb8914911e80eaf39b4733a9e12ea", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -23316,6 +24238,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -23367,6 +24292,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -23404,7 +24330,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 204536, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "1bb31f80019369a73865a3049a89c513d17cd9d031a6ed5ee92cd9cd31322c65", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 204536, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f203d37d346b80214c543f29f72340d9d8caff2c0b3dfa7021033018e49c7a41", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -23426,6 +24352,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -23477,6 +24406,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -23514,7 +24444,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 204296, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "fb801349440c41e798cd3c432460ed9fed52ee679e9beaf7dcb00a23f321085f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 204296, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "b846217462c09c953775c1505af225b71d793fc7f0e9bdd0e9ff45f96f482c32", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -23536,6 +24466,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -23587,6 +24520,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -23624,7 +24558,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 225008, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "be887dc0c11ba2bbbf3eb1a3f0a57868085b035decb26b29421c6fe7ba52f64b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 225008, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "4a295791e5a8d5d3c396a4ce870096b3b0ee7c8b89cdefb763d06e704fa01444", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -23646,6 +24580,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -23697,6 +24634,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -23734,7 +24672,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 224768, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "6386da50cd0030abdba6db42bb10015433b95e0c8d8968fb4012d4201b3e5b0f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 224768, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "ab906008c1cedc720b812d118a766589ed51a75541b05c405890e26aa6ac138d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -23756,6 +24694,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -23807,6 +24748,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -23844,7 +24786,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 204536, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "0fe5360c8d7d616250743260297eb65d76b064e2b3c8db13b5295bdc4d337f61", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 204536, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f9d2fe6f8dd4492770bafdf3875d841d2f53626827a5638ab279fd6193758e79", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -23866,6 +24808,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -23917,6 +24862,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -23954,7 +24900,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 204296, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "5af572e15e96313129cbdb568cbb69d6e2e152c6d2dbbdf9167fffcaaf8cdcd6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 204296, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "220d21a909b8f5b58b7b1b2816bba44e309c3f3e25477a47a98a2df51861a17a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -23976,6 +24922,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -24027,6 +24976,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -24064,7 +25014,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 171960, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "c909e07a2bfe6b1804de36905d8c2d0c76dd9acd243e319d0b38e8cbd85d2bc7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 171960, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "0f6126d30bb398c5f07bb085bd6ae79694d3b7e3a2136a8ab9e0dd891f26b7c8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -24086,6 +25036,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -24137,6 +25090,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -24174,7 +25128,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 171960, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "c54adc6836f9c645980e10b73e0356af6e2e6e920c8c189ad41cd9c23cf3c49e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 171960, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "2cdc9fbc165bebffebaa0194ecc97c2d78d8ebf4a8523d0cc4e45833b313987d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -24196,6 +25150,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -24247,6 +25204,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -24284,7 +25242,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 192152, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "74d1860fad2300afe8731db7afdb95c852e4ea79ab21ee37fefe46fe1b72fb96", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 192152, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f121489ee66e414cc3641901c975fb8c096ed8e97e1bd3060fc27848e91f08c7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -24306,6 +25264,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -24357,6 +25318,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -24394,7 +25356,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 192152, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "3ac762d9a70b81abf4395f3091dc801d42dc75c1c0060870c6684a4f8b8636e5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 192152, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "d608fb3ab512d1527cbca946e51503e36002d515daf5b7bfb71690f84ad54e66", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -24416,6 +25378,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -24467,6 +25432,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -24504,7 +25470,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 223056, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "7b9db4adb37890b89ba1639ec22e6df51fdb43915381e15bd79e1475dc3c2f53", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 223056, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "17d3d212e31ffc56f5d94675aa8769c07c068acfb6e9f0a1cc4fb53761fd37a8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -24526,6 +25492,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -24577,6 +25546,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -24614,7 +25584,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222816, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "94937503dab460da4a098b93dbc0c7054b2df737aeafb7aea75fb0b2696d1608", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222816, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "5d0d39a0f6f590e24ac6cb8f170231fe7da2d53bc6494dc5a7535a5bb0a83ae0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -24636,6 +25606,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -24687,6 +25660,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -24724,7 +25698,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 229208, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "7d6e5a4ed5c563e5a921019f2234571356b4a19ab6d5b60b35d42aaafedbbaaa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 229208, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "e47e3ac56c568a4dff51544305fb7710c44d722eaaa7613fae555644db8e1076", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -24746,6 +25720,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -24797,6 +25774,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -24834,7 +25812,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 220776, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "b877bb92c361eff2a5c2b9182b888a849690730bbe1698edb8946f494ac86dff", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 220776, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "89a46e501afbfaff7c8f346cf84f237917eaf40e212e0c7a7a07b3858e3f6aff", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -24856,6 +25834,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -24907,6 +25888,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -24944,7 +25926,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 223056, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "134c570b9a804543b350e45657716e1156b2747fbb6de2506b4cc642e37e34f0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 223056, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "7bd67d4fe727e8deccafb7b7090459e06945d803f27d974f91dd7d49f5148685", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -24966,6 +25948,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -25017,6 +26002,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -25054,7 +26040,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222816, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "8a070bdc6b463926897b3bf6960eeb4d63f6860b1ab4007ff2f8492313b962bb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222816, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "3f5a4c0d60200ef54b652717ae5dfe76592fd0dc803aa0576cd96f5e1ab09f6f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -25076,6 +26062,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -25127,6 +26116,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -25164,7 +26154,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 229208, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "bde30a55f99ddbc1cdcf0a73be0a7dd016c49a20e51b31e605758ab1326cb049", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 229208, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "0e3be39fb462d648561680a9c789c3c234bd1f780eae7166df3d037373f4bfe7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -25186,6 +26176,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -25237,6 +26230,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -25274,7 +26268,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 220776, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "af241a4e0cf5f2a85e03bd1ed2b8d72798f12b388a035082babdad65bc33167b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 220776, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "b6d6800ca5931d6f35a7bc573bf817645ef655476c45ac29b6f8b3d60f964eed", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -25296,6 +26290,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -25347,6 +26344,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -25384,7 +26382,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222768, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "f400cd5784830fd1c99ac76b920970cfde640c8d2cdb495cfeab2072cd6baa0e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222768, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "2cdaa14c0e4745e0a3a7a6ebed48c6b54049cdf6c5b22302703e6a4f60e34d2d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -25406,6 +26404,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -25457,6 +26458,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -25494,7 +26496,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222528, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "225dd7ad71b9504dd0687cd4d8bb61fdc357903b43abc90382281f9f9f3aaaaf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222528, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "6f1023733b81ca69a279d1648a19bdc842a9f6e604151c9a2293db23865cbfaa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -25516,6 +26518,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -25567,6 +26572,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -25604,7 +26610,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 228920, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "99a1871208c492c86af39d1e9e529b6b9748ab76e84af5d99f98acd8d7ec25f3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 228920, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "b6955ccdbaa1a1d00e0e3a16f4d91319959be1d2cd2df73947f9779e8bfa60fd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -25626,6 +26632,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -25677,6 +26686,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -25714,7 +26724,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 220488, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "5175e7555a16dd56edb47f92b94e4c2bb9d740bc4b5f05c26256a3c509b40764", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 220488, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "39bc011f37dd392c41668b79956951d3ff563256943e0f7aa442a2ab4cf84a6b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -25736,6 +26746,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -25787,6 +26800,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -25824,7 +26838,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222768, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "31e87d2eed3a20db57d59b33b9ef8ae255600d6b8f9eaf24d871d791a3ac5294", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222768, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "a0d7e3075c1531ca381b808abdd7cb878328efa37552df7e6b06bc1ba281e21f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -25846,6 +26860,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -25897,6 +26914,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -25934,7 +26952,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222528, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "e670f58b311cdaaa7592b8e63a9fbfeeda8d8d06b47306ae2d89bf7d7a1856bf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 222528, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "56296de38781ef502302e77cc25acf95a477834aabfd1958f57dc9dd810bf2b2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -25956,6 +26974,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -26007,6 +27028,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -26044,7 +27066,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 228920, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "c8bc080df22c1bd8f4aa3be0d67c3f2d9276c0206b83b9c5e3ee69c504f987c6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 228920, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "40a9aa30b938e93c28f0c1c60cc36bd9cba0379298d67cb853fdb492fc178769", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -26066,6 +27088,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -26117,6 +27142,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -26154,7 +27180,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 220488, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "d6ef4b0b584f97db292f0402c2b8361f8322aab748368528c8a35674f2e3fde7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin_len, 220488, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f", 512, "0e8f6cf29bb158e2312b08dc14608160ccf3da6696683700e27acf18806f3e57", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -26176,6 +27202,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -26227,6 +27256,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -26264,7 +27294,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "e876fd6bff77426c2bcffcb869c905b82d8c2ed91f390939a6a84619fac083d1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "dbae25fac979f97bd8352155c5c3455f6ded7fb85ac9d419ca51ac48a7ca9652", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -26286,6 +27316,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -26337,6 +27370,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -26374,7 +27408,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "7b5a67c43e37b42676ff0f83af8d111dbeff28c73bed0d8db5fc0aaad9e342b4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "595882189d311d4aa949925ba4d9d2460c5fa98d1340ceb86e26aac2b944f911", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -26396,6 +27430,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -26447,6 +27484,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -26484,7 +27522,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "48a20560df4495e092fc341d2c311f6a1d1426b2cd07da90bede5bd11f649ae0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "1d1c2624184081e5324051d28b7587760da0673e646d677bd77de72232855010", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -26506,6 +27544,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -26557,6 +27598,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -26594,7 +27636,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "1ac07f4b804077539769887308c557067d1cb9daa4e88408ef227ba0643e0836", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "6a52a8f3613dbe282caade001b1ce1c6aa5af822fcb16b07e80f130e175a7509", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -26608,7 +27650,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -26616,6 +27658,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -26624,7 +27669,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -26667,6 +27712,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -26675,18 +27721,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -26704,7 +27750,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "41295b9b8f4feef92fec4f0f100cf0b0de14d654d2f40090d0fab96ceb07ad18", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "644358aadca5078509f067e9f57b95b3ff10dd69b72b4fe08d2d17d083a79027", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -26726,6 +27772,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -26777,6 +27826,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -26790,7 +27840,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mValidN */ 256 , /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -26814,7 +27864,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "0b08531adaec9d25db77f00fd3e090255e9f7d46e2b5a6e3b1be2cb28aa04d4c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "c91c954befe6fc7d0a459bea5d79802a86e29d577cc0430bb611f975a1f619dd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -26828,7 +27878,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -26836,6 +27886,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -26887,6 +27940,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -26900,13 +27954,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mValidN */ 256 , /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(2) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -26924,11 +27978,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "8727469f146050dcea1fd2e1ac03cf8a13e2e864d1384a05072c4b40f37db95d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "ec7c2a253d27752f7767090709954d92b0e0d60746e56e144ec5010bbd2431d7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -26938,14 +27992,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -26954,15 +28011,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 16 +, /* mMmaM */ 256 +, /* mMmaN */ 128 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -26970,10 +28027,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -26985,18 +28042,19 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfBlockSizeC */ 16 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 16 +, /* mTileN */ 128 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -27005,18 +28063,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -27034,11 +28092,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "2b72f37e923c549f3a7f9789aaef29d21b4ac0ed2d60272f4b598ede2ca95cc5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 180984, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "d03f852b0e1c97fc122b92fe07b97dd08c044750046ea9f1a1798fcb14062156", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -27048,14 +28106,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -27064,15 +28125,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 16 +, /* mMmaM */ 256 +, /* mMmaN */ 128 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -27080,10 +28141,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -27095,18 +28156,19 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfBlockSizeC */ 16 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 16 +, /* mTileN */ 128 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -27115,18 +28177,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -27144,7 +28206,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "5a74150da09a0d526cdd8221f5dc0d8c196f15761051154b0200022e7c9fb361", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "7d962f8e562fcceec40e8bf1c6c8be22512dd90306be9f0192e73b04f8ff73c5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -27158,7 +28220,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -27166,6 +28228,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -27217,6 +28282,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -27230,13 +28296,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mValidN */ 256 , /* mValidK */ 256 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -27254,7 +28320,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "3ad91617f6723b393caf579a0f22e2811a517b218fcf5208632cecb76d50d418", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "430def109a944b6574bcba2136dc219890320d96f75d40880acbae93f7b4be7d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -27276,6 +28342,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -27304,9 +28373,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 9 -, /* mNumStagesMma */ 1 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -27323,10 +28392,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -27340,7 +28410,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mValidN */ 256 , /* mValidK */ 256 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -27364,7 +28434,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "f90a6cacbfe3014da5019eeb78d0e6a84b9ebfd0c2923ae6c3f9ca27bc28a029", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "776c40e2c1d6a9f0d4e14abbb1148f7e07403ddccfa78667b2f9d2a6a44c9ee0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -27378,7 +28448,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -27386,6 +28456,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -27414,9 +28487,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 9 -, /* mNumStagesMma */ 1 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -27433,10 +28506,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -27456,7 +28530,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -27474,7 +28548,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "d23a31d10dcb22d7b74d56844ed884da9001ba59d2e8bbdde1230969eecb9868", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "18793283463ba18e691ffc8658d58cdab69ee5d0b88ec325e1f8c5f95482a4ed", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -27488,7 +28562,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -27496,6 +28570,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -27524,9 +28601,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 9 -, /* mNumStagesMma */ 1 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -27543,10 +28620,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -27584,7 +28662,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "e678ccbcb93c5edd156afd626a31a586ea2fb27956f1566ff8c5c4d92d65c6e4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "d2f6278d497d434118a8eddcc65aea55f2cf54c157d50dbb43f8e5fe53c62423", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -27606,6 +28684,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -27614,7 +28695,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -27634,9 +28715,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 9 -, /* mNumStagesMma */ 2 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -27653,10 +28734,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -27665,10 +28747,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 @@ -27694,7 +28776,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "00cf1a66a807a1e74fa7539b297fcb77a44e41b6b7474ddd10819a5ccd9d5d85", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "d7d81a836e92f7b4999981a696c50794c4f247f575f1081d779042bbe3270135", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -27716,6 +28798,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -27724,7 +28809,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -27744,9 +28829,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 9 -, /* mNumStagesMma */ 2 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -27763,10 +28848,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -27775,10 +28861,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -27804,7 +28890,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "aca3b9bd44ab0a955bb71a2154546fcdf158fc356feadf0c9e3db676df519b04", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "74afcf62d70577619656adadc45f79e5fbc9d104d8872c595d00e16ace2a33da", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -27826,6 +28912,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -27834,117 +28923,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 16 -, /* mSfBlockSizeB */ 16 -, /* mSfBlockSizeC */ 16 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 512 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadA */ 0 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfA */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfA */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "ea5533ff75fef077157e38129dade065253062e7c27527413c9c8731cac10832", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -27987,6 +28966,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -27995,18 +28975,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -28024,7 +29004,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "65076d360152b3233c304c85422125dee0e9d4e6c82af87a032f22c7d8086466", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "7113ed96c5fb373243e6ca45eacaa5fc4ae4824990912f4671d5252ea7a5b9e0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -28038,7 +29018,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -28046,6 +29026,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -28054,7 +29037,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -28097,6 +29080,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -28105,10 +29089,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -28116,7 +29100,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -28134,7 +29118,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "4e6cfd41ed9a5258da4d31c15945ffe9f641be024d8cb0e9f41ae95afc1e54ac", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "5a682227d42094c71128e316628d4562cc9a45a2c76dd5b77f98e26337920e42", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -28148,7 +29132,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -28156,6 +29140,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -28184,9 +29171,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 9 -, /* mNumStagesMma */ 1 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -28203,10 +29190,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -28220,13 +29208,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mValidN */ 256 , /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -28244,13 +29232,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 195224, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 768, "5803464557f19440989556285a7193ad58e4d0f3ae128e08e8ad3b8ee9b2b3d5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "9265ebbed68f4ba733007e26b9f96cdcc95d4688907db7cd5568276b866f0b3d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 3 +, /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -28266,6 +29254,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -28274,7 +29265,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1536 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -28291,9 +29282,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 3 +, /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 9 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -28309,14 +29300,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -28325,12 +29317,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1536 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -28351,16 +29343,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 195224, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 768, "ebcbfbe792c15d1a65716f73884edea8deedbb2237ad55b6260e337d31900683", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "e1fd8debd3d4df9a7d5fb39e7cfe8cc47c1ce915de4d5a8ce3706709ca760d99", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 3 +, /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -28376,6 +29368,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -28384,7 +29379,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1536 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -28401,9 +29396,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 3 +, /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 9 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -28419,14 +29414,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -28435,12 +29431,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1536 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(2) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -28461,126 +29457,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 203416, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 768, "1d755d8dc22edbeb4cac3c215c0fa239764d8dcbfd97c411bbc3bb340f03f9cf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 4 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 4 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 16 -, /* mSfBlockSizeB */ 16 -, /* mSfBlockSizeC */ 16 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 2048 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadA */ 0 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfA */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfA */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203416, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 768, "b5857e6123971e2abd4a43eb000969fad26c32d9ed3c4a97e5ce9685fecd7e68", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 193648, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "145a2cb97b4d95e9d1eeee9e9facee736ac863ede4e1d2a3e4dda6d4e16f2b4e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 4 +, /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -28588,7 +29474,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -28596,6 +29482,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -28604,7 +29493,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -28621,9 +29510,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 4 +, /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 9 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -28639,14 +29528,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -28655,12 +29545,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 2048 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(2) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -28681,14 +29571,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "563a484966fa0a61e1663d451c42b6f8ff26ad06eeed4ea429bd2f46d23c4222", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "acaa3400a66e22eb1ef4854d6a3a6e7647dac46b324245bd04e3ad61f805c11c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -28706,6 +29596,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -28721,7 +29614,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -28733,10 +29626,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -28750,13 +29643,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -28765,7 +29659,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 512 @@ -28791,14 +29685,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "701d134fa408710e3456c6c2ae962d10966775dde8ca22a70fb70fcba6d7b982", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "02ac5244ac22d85dbc88792498b0f096fb02ac77285e6f3d968c4216a4686754", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -28816,6 +29710,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -28831,7 +29728,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -28843,10 +29740,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -28860,13 +29757,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -28875,7 +29773,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 512 @@ -28901,14 +29799,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "d0cdfd1fb7b214e16e7736a2fa990a8bf424445877e33ab88340e02775c1707a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "7e4aadcfc3cbcf23bb6b82209ed203f1c180cf442639d48f322c780ba5fc2a04", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -28926,6 +29824,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -28941,7 +29842,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -28953,10 +29854,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -28970,13 +29871,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -28985,7 +29887,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 512 @@ -29011,14 +29913,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "720f237fad7a97489b65e57c35e03ed401f87f468e16bb440e5b8e7f9e79f24f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "e28d864a6fa25d144ec703cbbf0319a33fd39d89b1245b5248c988e1bdbfdabb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -29028,7 +29930,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -29036,6 +29938,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -29051,7 +29956,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -29063,7 +29968,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 9 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -29080,13 +29985,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -29095,18 +30001,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -29121,16 +30027,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "d043d45d598685216f8ba1d65a8f12b017fe802e9cf4292c8de449f8fffaf6e2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 195224, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 768, "b4be285b999991d6d4d32699f8deecc16df70ba618369decb532de8c92050c31", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 3 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -29146,6 +30052,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -29154,14 +30063,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1536 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -29171,12 +30080,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSplitK */ 3 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -29189,14 +30098,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(0) +, /* mSplitK */ gemm::SplitK(2) , /* mTileK */ 512 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -29208,9 +30118,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1536 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -29234,13 +30144,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "0f03b6e2fdf9f246afc442e23ffbc811077616f0f4837957c224ae8c43c252bb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 195224, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 768, "cd9996919ece26df7ac5f916fa237b51c7c6da077fe8e7add89811da6f05a761", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 3 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -29256,6 +30166,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -29264,14 +30177,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1536 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -29281,12 +30194,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSplitK */ 3 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -29299,14 +30212,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(0) +, /* mSplitK */ gemm::SplitK(2) , /* mTileK */ 512 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -29318,9 +30232,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1536 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -29344,13 +30258,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "3788c76d527fff018836a0373b6f64fe037b912f8ffb8de54621e33f6d9cc6b5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 195224, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 768, "38d9f43d6c0be692e2216f7e9597f47a4e0d60670a204c9bc97bf32c7412016e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 3 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -29358,7 +30272,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -29366,6 +30280,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -29374,14 +30291,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 1536 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -29391,9 +30308,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSplitK */ 3 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 4 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -29409,7 +30326,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(0) +, /* mSplitK */ gemm::SplitK(2) , /* mTileK */ 512 , /* mTileM */ 128 , /* mTileN */ 16 @@ -29417,6 +30334,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -29425,18 +30343,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 1536 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -29454,13 +30372,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "93194a94fd4877926edf979df143597a49505ebbccc62a9cdf147b3911040027", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 203416, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 768, "40184a1075a33631816db5dddb072bde690760179f7983623073ff8c57f64d99", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 4 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -29476,6 +30394,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -29484,14 +30405,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 2048 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -29501,9 +30422,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSplitK */ 4 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 4 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -29519,7 +30440,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(0) +, /* mSplitK */ gemm::SplitK(2) , /* mTileK */ 512 , /* mTileM */ 128 , /* mTileN */ 16 @@ -29527,6 +30448,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -29535,12 +30457,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 2048 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -29564,13 +30486,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "8df595bf5ad078ba904e84b00d4077b606d97338bfc4fa29d9840c40854ed256", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203416, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 768, "44ee404a49cb94ecb5636088319773b4d755f12bf6f7c0742f96cb36dcdcd163", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 4 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -29586,6 +30508,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -29594,14 +30519,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 2048 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -29611,9 +30536,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSplitK */ 4 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 4 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -29629,7 +30554,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(0) +, /* mSplitK */ gemm::SplitK(2) , /* mTileK */ 512 , /* mTileM */ 128 , /* mTileN */ 16 @@ -29637,6 +30562,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -29645,12 +30571,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 2048 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -29674,13 +30600,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "5f69f8090805174cd4e4afd1bd505a3b28bce2efc4a46b6e130fb58f8ac09351", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203416, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 768, "af582ab7cb04999b863fd857f134f369dd4f36c121d5f92ce485e508e82e8f45", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 4 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -29688,7 +30614,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -29696,6 +30622,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -29704,14 +30633,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 2048 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -29721,12 +30650,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSplitK */ 4 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -29739,14 +30668,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(0) +, /* mSplitK */ gemm::SplitK(2) , /* mTileK */ 512 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -29755,18 +30685,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 2048 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -29784,7 +30714,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "ee3439523963bbc9682bc4affaf5a05f29288dcdb8c5692c9807f236ab6b8de7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "8b46aa771c2ebc3fd53b089599638b032505e8a8947a9f9570ddb124b407fb6c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -29806,6 +30736,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -29814,7 +30747,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -29834,9 +30767,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -29853,10 +30786,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileK */ 512 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -29865,12 +30799,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -29894,117 +30828,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "c9c42a321cba7ed4145e0ec0cd7aae8be915ebd341f0efdb216e47c45d0e66d6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 16 -, /* mSfBlockSizeB */ 16 -, /* mSfBlockSizeC */ 16 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 1024 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadA */ 0 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfA */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfA */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 172616, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f", 768, "356b1bf529ab39b95a2a60de1aed74a700cbe71346e1896ada0dbf6fd08f7f7f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "e2f7cfddb9cbaecb6614657613f0f2df0f4c5578b7e0658402ac94abd735e25f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -30025,8 +30849,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mFuseUtccpWithUtcmma */ 1 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 @@ -30034,7 +30861,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -30042,21 +30869,21 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 256 -, /* mMmaN */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 8 +, /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 144 -, /* mNumRegsPerThreadNonEpilogueWarp */ 88 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -30065,20 +30892,21 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfBlockSizeC */ 16 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 256 +, /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 1 +, /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 , /* mUseShuffledMatrix */ 1 @@ -30088,9 +30916,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -30102,19 +30930,19 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadA */ 0 -, /* mNumRegsPerThreadLoadB */ 32 +, /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfA */ 0 -, /* mNumRegsPerThreadLoadSfB */ 40 +, /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 -, /* mNumWarpsLoadSfB */ 4 +, /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(3)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 172616, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 768, "5e708873c4607cfff47cc6ca7a4ba33d6262c8b87e19e174aa98b9a0f8036ba6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "972b0e7cab5462fbf7aad5cf70d7dc23ec1655964b8bbe064944428c1e194d5e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -30128,15 +30956,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mFuseUtccpWithUtcmma */ 1 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 @@ -30144,7 +30975,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -30152,21 +30983,21 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 256 -, /* mMmaN */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 8 +, /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 144 -, /* mNumRegsPerThreadNonEpilogueWarp */ 88 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -30175,20 +31006,21 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfBlockSizeC */ 16 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 256 +, /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 1 +, /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 , /* mUseShuffledMatrix */ 1 @@ -30198,7 +31030,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -30206,25 +31038,25 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadA */ 0 -, /* mNumRegsPerThreadLoadB */ 32 +, /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfA */ 0 -, /* mNumRegsPerThreadLoadSfB */ 40 +, /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 -, /* mNumWarpsLoadSfB */ 4 +, /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(3)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin_len, 172616, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f", 768, "645be6164d33eda1c00b15120eee3371fc68af37b3b7a2250dab1c4935c71077", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "c2259e360d1651f581002ba46d6e76a0b751ac0a3b38deab925a58d6bbabb122", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -30238,15 +31070,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mFuseUtccpWithUtcmma */ 1 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 @@ -30254,7 +31089,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -30262,21 +31097,21 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 256 -, /* mMmaN */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 8 +, /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 144 -, /* mNumRegsPerThreadNonEpilogueWarp */ 88 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -30285,20 +31120,21 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfBlockSizeC */ 16 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 256 +, /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 1 +, /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 , /* mUseShuffledMatrix */ 1 @@ -30308,9 +31144,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(2) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -30322,23 +31158,23 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadA */ 0 -, /* mNumRegsPerThreadLoadB */ 32 +, /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfA */ 0 -, /* mNumRegsPerThreadLoadSfB */ 40 +, /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 -, /* mNumWarpsLoadSfB */ 4 +, /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(3)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "7df56cd9ceba03381de7337a973bdec858c7edd23ce3ba897d001efc1d9bc47f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "4176e58ae58240ab99fc026cefdbc61e5678fadb6ff3df42d68345aa1e848039", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -30355,7 +31191,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -30364,15 +31203,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -30383,10 +31222,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -30400,13 +31239,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -30418,7 +31258,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 @@ -30441,14 +31281,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "445e22fc5aed3e74f2afd2c913aac42809360d7d272269452aa0b72c4a8a0338", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "4822e26ba37df0e6e9d9b37c279d291face06f92575aa1dc89be984a03ae52fd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -30465,7 +31305,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -30474,15 +31317,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -30493,10 +31336,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -30510,13 +31353,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -30528,7 +31372,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -30551,14 +31395,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "2bddd6538f9a80ff215903a220015e27cc154e3017aa71dfa6e2727111badbd8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "7e84404342220ee4f79ff40b778cffa79091baf9d5ae0475108813d2df6e0803", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -30575,7 +31419,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -30584,15 +31431,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -30603,10 +31450,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -30620,13 +31467,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -30638,7 +31486,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -30661,14 +31509,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "0684c13b7797a3ff3b43e75152fd7087526bff87a686b21db68f836ff20532a7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "e11bef5c553f5b1a3f773f78eadadce63744e269ad4e577e3e90094f704528ca", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -30678,14 +31526,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -30694,15 +31545,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -30713,7 +31564,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -30730,13 +31581,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 32 +, /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -30748,7 +31600,121 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "feda64d64314aeeaf5a65573214c88e10292e65af8250c059f4c6e9286ca2f9b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 2 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 256 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 @@ -30771,14 +31737,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "6c6543405ddd91a63005332a83c6170f52d300879c5465aeb7b136f0944528e2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "eb675c730c86a205aed8baf11717eb87ab18dca100f44fff0bac729e4eef923f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -30795,7 +31761,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -30804,15 +31773,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -30823,10 +31792,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -30840,13 +31809,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -30855,10 +31825,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -30881,14 +31851,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "10a5cd60a3c7b3e42cd6b108fb8b3a60859af9bc19f36b1897edfb3b74325a6b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "c04f048b76611b2b74813d8f611e9723a726408a736ae1df0524fa0655ab2ca0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -30905,117 +31875,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 16 -, /* mSfBlockSizeB */ 16 -, /* mSfBlockSizeC */ 16 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadA */ 0 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfA */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfA */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "5886741e6a30f896a3965b479ecaa79c3332e65f360e7031a4896e4186bc57b0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -31024,15 +31887,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -31043,7 +31906,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -31060,13 +31923,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 32 +, /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -31078,15 +31942,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -31101,14 +31965,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "02445c49e827cac872950069e0ffb4d6eaa83b950230220da8dbc1d95557e863", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203512, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "ce9f9cc3dc51379712065de08e5c54a0bceecc17e5c989736c0cf6a71213a512", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -31118,14 +31982,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -31134,15 +32001,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -31153,7 +32020,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -31170,13 +32037,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 32 +, /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -31188,7 +32056,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -31196,7 +32064,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -31211,14 +32079,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "3d232bbcc8c4f1c6497d77a81d93991864fa9079b993a8b2653c16a91027ba11", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "1f7cee65ffe3f8f5f54c69040b09c3fbc3df323be7c8436dd9161f8fa520a790", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -31228,14 +32096,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -31244,15 +32115,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -31263,10 +32134,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -31280,13 +32151,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -31298,15 +32170,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -31321,14 +32193,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "fc0a94f1ed97f450f07f429c0ecadccd8eeb9683aeba735fe8c59941f018050b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "7dfa57cf56d9412230731b8914c86a4709a47cef116d478064f7547427992697", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -31345,7 +32217,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -31354,15 +32229,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -31373,7 +32248,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -31390,13 +32265,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 32 +, /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -31408,9 +32284,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -31431,14 +32307,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "887257c3011ac976d8ad3d9f628389054f379b23e73dcc26b31d8ad721015662", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "7eb6510220f41643668c8efef6b9c5e7b67004da77db4d9256ba9426a2137162", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -31448,14 +32324,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -31464,15 +32343,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -31483,7 +32362,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -31500,13 +32379,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 32 +, /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -31518,7 +32398,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -31526,7 +32406,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -31541,14 +32421,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "04a9666a04a2a4b8d2d171c9ae6ae88992d100de13223811deb29df20d82ae75", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "0e4ca48ddee507afd0058e8e8bd831dc8dc265933278a6b99dbbebccff10d13c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -31558,14 +32438,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -31574,15 +32457,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -31593,7 +32476,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -31610,13 +32493,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 32 +, /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -31628,7 +32512,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -31651,16 +32535,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 175672, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 896, "34b34c7e43159332d42a5226a1230bd1288b84974f02dcfe2bacd73d854b30b3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 172616, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f", 768, "0895a763d11e75bea4d2ccbff9a8e8f6302ab45b6251ca4a7af4645f05ff50e8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 +, /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -31675,8 +32559,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 1 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 @@ -31684,29 +32571,29 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 256 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 4 +, /* mNumEpilogueWarps */ 8 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 2 +, /* mNumRegsPerThreadEpilogueWarp */ 144 +, /* mNumRegsPerThreadNonEpilogueWarp */ 88 +, /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -31715,20 +32602,21 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfBlockSizeC */ 16 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 32 +, /* mTileN */ 256 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 +, /* mUseMaxTmemOverlap */ 1 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 , /* mUseShuffledMatrix */ 1 @@ -31738,7 +32626,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 @@ -31752,25 +32640,25 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadA */ 0 -, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadB */ 32 , /* mNumRegsPerThreadLoadSfA */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 40 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 -, /* mNumWarpsLoadSfB */ 0 +, /* mNumWarpsLoadSfB */ 4 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(3)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 175672, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "9082649b567a8bc5d7e45e1a749124f3e2f7721f90c455a611277f5ea1d3cdb3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 172616, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 768, "53afae170050089434427fbce50c144b103bd901a00c8a7e35291c4b214343ce", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 +, /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -31778,15 +32666,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 1 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 @@ -31794,29 +32685,29 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 256 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 4 +, /* mNumEpilogueWarps */ 8 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 2 +, /* mNumRegsPerThreadEpilogueWarp */ 144 +, /* mNumRegsPerThreadNonEpilogueWarp */ 88 +, /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -31825,20 +32716,21 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfBlockSizeC */ 16 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 32 +, /* mTileN */ 256 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 +, /* mUseMaxTmemOverlap */ 1 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 , /* mUseShuffledMatrix */ 1 @@ -31848,33 +32740,33 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 256 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(2) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadA */ 0 -, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadB */ 32 , /* mNumRegsPerThreadLoadSfA */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 40 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 -, /* mNumWarpsLoadSfB */ 0 +, /* mNumWarpsLoadSfB */ 4 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(3)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 896, "a2f064856d61dfc4f8e31e8a2c221b39e2c940367d08844a1d8b166f1fb454e9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin_len, 172616, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f", 768, "2944fce32fc866c4fcd16255497199002bbee5da15b5683ecf095738c339b6a5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -31888,15 +32780,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 1 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 @@ -31904,7 +32799,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -31912,21 +32807,21 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 256 -, /* mMmaN */ 32 +, /* mMmaN */ 256 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 4 +, /* mNumEpilogueWarps */ 8 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 144 +, /* mNumRegsPerThreadNonEpilogueWarp */ 88 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -31935,20 +32830,21 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfBlockSizeC */ 16 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 32 +, /* mTileN */ 256 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 +, /* mUseMaxTmemOverlap */ 1 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 , /* mUseShuffledMatrix */ 1 @@ -31958,33 +32854,33 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadA */ 0 -, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadB */ 32 , /* mNumRegsPerThreadLoadSfA */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 40 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 -, /* mNumWarpsLoadSfB */ 0 +, /* mNumWarpsLoadSfB */ 4 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(3)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 896, "60ee6d0bd039df49844844f5ffbed2951bd8aba456a875991951368c898cd651", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_silu_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_silu_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin_len, 172616, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_silu_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f", 768, "acff009c3662793b3c92e9bc4c4ca9a654150331d010fa6fdec28e396be749ef", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -31998,15 +32894,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mFuseUtccpWithUtcmma */ 0 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 1 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 @@ -32014,7 +32913,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -32022,21 +32921,21 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 256 -, /* mMmaN */ 32 +, /* mMmaN */ 256 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 4 +, /* mNumEpilogueWarps */ 8 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 144 +, /* mNumRegsPerThreadNonEpilogueWarp */ 88 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -32045,20 +32944,21 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfBlockSizeC */ 16 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 32 +, /* mTileN */ 256 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 +, /* mUseMaxTmemOverlap */ 1 , /* mUsePerTokenSfA */ 0 , /* mUsePerTokenSfB */ 0 , /* mUseShuffledMatrix */ 1 @@ -32068,37 +32968,37 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadA */ 0 -, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadB */ 32 , /* mNumRegsPerThreadLoadSfA */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumRegsPerThreadLoadSfB */ 40 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 -, /* mNumWarpsLoadSfB */ 0 +, /* mNumWarpsLoadSfB */ 4 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(3)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "dca4b39f00c5e17f8e150f884fb42527d6ae96214efb269f30437d9b581dc010", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "219b8f837bb5ec2d7560019a5bb2c176636bbde1cea5449160ba3afe3d976924", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -32108,7 +33008,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -32116,6 +33016,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -32124,14 +33027,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -32143,7 +33046,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 9 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -32160,13 +33063,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -32178,15 +33082,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -32201,14 +33105,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 768, "f6057febca6ef33a96802a148e2f812ca24e95963ab9b18e3658717302807208", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "0fa3cc177e015687be27c13845ec9eda43b3182d9854ffe523ce01c2284b4181", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -32226,6 +33130,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -32234,14 +33141,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -32253,10 +33160,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -32270,13 +33177,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -32288,9 +33196,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -32311,14 +33219,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 768, "36096626550738537780d40d5a447476eb1b8a4546ab916dca41ff6b912b6255", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "6d13a81aec4f8f4c3067110ff6cddc42d569bcaa2c1d1657d091c7ba178d02d9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -32328,7 +33236,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -32336,6 +33244,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -32344,14 +33255,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -32363,10 +33274,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -32380,13 +33291,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -32398,7 +33310,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -32406,7 +33318,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -32421,14 +33333,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 768, "4959925c55ded58259339af5a26f3a370b909173a769298cef72f106838e762c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "d86c2c20b1ebc8f2d34a0970fe83031fe4c621b271eb17cbc06367fbe63d449a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -32438,7 +33350,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -32446,6 +33358,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -32454,14 +33369,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -32473,10 +33388,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -32490,13 +33405,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -32508,7 +33424,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -32531,14 +33447,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 896, "f00d48938b5982ed541c47333c9a85c164f6ad51a16e04be30d0d0af6c383120", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "ec55d7175b30dbfc4fd572ec8ed34a6c33d9bf558d0c22995354125dd8f2a4b1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -32556,6 +33472,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -32564,14 +33483,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -32583,10 +33502,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -32600,13 +33519,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -32615,10 +33535,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 @@ -32641,14 +33561,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 896, "6fd57fa070371c6e843f38d501ddddb631f5f9a6a7b68984ac82c6be5dce8d86", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "187fd126724246405e3ed91d3b7153578f9ae25009d8a1b38e6b3d9e3f52263d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -32666,6 +33586,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -32674,14 +33597,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -32693,10 +33616,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -32710,13 +33633,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -32725,10 +33649,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -32751,14 +33675,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "81d38f2f3afc955c18082ef9458b504f2c8f7cb9a174be691f8a13c69736ec51", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "045d6aa32c1f04c1f51652b87664ba516d4cedcd54b2ac819b07d56b6f349cbb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -32776,6 +33700,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -32784,14 +33711,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -32803,10 +33730,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -32820,13 +33747,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -32835,10 +33763,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -32861,14 +33789,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 768, "fe2d8d92492a06da698a1237163bb63c5d2d7f13b60e03bc53d57a925faac7d8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "a2104851aff21b5f9ecd95bb38ac3df8fe234320063ca3c629943b9a87f5f9ff", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -32878,7 +33806,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -32886,6 +33814,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -32894,14 +33825,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -32913,7 +33844,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 9 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -32930,13 +33861,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -32945,18 +33877,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 256 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -32971,14 +33903,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 768, "9ef8e1b2b6e665b5bb04ea71ef5719885c32b25c85cccfb94f249754d93bccbb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "86ed86a3dbedc37e898e14254352b61815566d632866be67eb6e9c8a248b7e73", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -32996,6 +33928,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -33004,14 +33939,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -33023,10 +33958,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -33040,13 +33975,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -33058,9 +33994,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -33081,14 +34017,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 768, "5d55899b488f30a2f2383fa157057205bf6cbeb26e8d705c92ecc702ec46e9f3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "574ac6bcac81b2b03cd658deaefcf5bb5f02099a707dc08fd671b60f35217ba4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -33098,7 +34034,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -33106,6 +34042,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -33114,14 +34053,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -33133,10 +34072,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -33150,13 +34089,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -33168,7 +34108,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -33176,7 +34116,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -33191,14 +34131,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "d7845ba0e9479c3d8dd1ef0621df6ede366c1a8e7c00a01d8517225f8e1acd3d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "7f9166006cce5a422f87a5f1e4f8ea924bccadf8dea14fbc22ec506283ebbb56", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -33208,14 +34148,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -33224,15 +34167,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaM */ 128 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -33240,10 +34183,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 +, /* mNumStages */ 9 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -33262,11 +34205,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 64 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -33275,18 +34219,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -33304,11 +34248,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "da13cb1ea87c963709ce9d282a7a2ff9e69641f5554bf6cfe78622ed900b89f7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 215152, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "1272cfa7ed90576061c0e7d6cd38f7d9a49a501bb9e9a1692698d81abededbb2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -33318,14 +34262,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -33334,15 +34281,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaM */ 128 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -33350,10 +34297,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 +, /* mNumStages */ 9 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -33372,11 +34319,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 64 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -33385,10 +34333,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -33396,7 +34344,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -33414,11 +34362,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "6691468120670e20e4617c444989aba4cbfc244b81cd14656c462532a79cae3d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "b5aee42a14b2172eb4c23001bf8f3f34077ec9b85d490dcb7d8a7e3f6ed20b62", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -33428,14 +34376,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -33444,15 +34395,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaM */ 128 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -33460,13 +34411,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -33482,11 +34433,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -33495,18 +34447,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(2) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -33524,11 +34476,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "32330d23dc770871dd08be727d555736d9dd3fccce7f55c0818353f4e3ea5e5b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "8a7c3a8bc49fe8a6642e7c18cf458c567e753051e03c06aa18080b774415c005", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -33545,7 +34497,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -33561,8 +34516,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaM */ 128 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -33570,13 +34525,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -33592,11 +34547,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -33610,7 +34566,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mValidN */ 256 , /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -33634,11 +34590,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "961b4c0931a18742cddb17f25d90bb401c4064ca83df1493a9e784da2cb3371c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "1517b3f2c06e4d709d834aed3fa608dc23deca0b7a61194577365f9d63223352", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -33648,14 +34604,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -33671,8 +34630,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaM */ 128 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -33680,13 +34639,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -33702,11 +34661,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -33726,7 +34686,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -33744,11 +34704,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "06be952c3539e25ea13c8a746e40d64e8bfb181c392a512a5abf871c6195bc7b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "2b206069d79dcbc4df528be693d9782709b79796cd5f703029b5a8247b571d44", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -33758,14 +34718,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -33781,8 +34744,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaM */ 128 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -33790,13 +34753,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -33812,11 +34775,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -33830,7 +34794,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mValidN */ 256 , /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(2) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -33854,13 +34818,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 896, "356a82d5abfee0c90a5d660feff497e8675542ed6bfcc15908405a3a996d70f1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 175672, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 896, "616b3943d20160b6edb4c9bd8830cb8b41cf71b0254f93337ab53f816cc06772", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 2 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -33876,6 +34840,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -33884,15 +34851,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaM */ 128 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -33900,10 +34867,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 -, /* mNumSlicesForSplitK */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 2 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 3 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -33919,14 +34886,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(0) +, /* mSplitK */ gemm::SplitK(2) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 64 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -33938,7 +34906,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 @@ -33964,13 +34932,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 896, "22f9918d86e166fb349e1876bda7dd15b691cd6495a12662c1d702790ddafdd1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 175672, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "b7d58686994de72eee9584e261afd79b6fed6769f839f1ad82e3869739532b80", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 2 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -33978,7 +34946,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -33986,6 +34954,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -33994,15 +34965,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaM */ 128 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -34010,10 +34981,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 -, /* mNumSlicesForSplitK */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 2 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 3 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -34029,14 +35000,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(0) +, /* mSplitK */ gemm::SplitK(2) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 64 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -34048,15 +35020,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -34074,13 +35046,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "a62bb1623b2d3120c1e06c3c909ea9153c699bee445a92e8bd27c44f0a0b6e21", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 175672, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "f6c5ddda331b99621220bb18ec0e7d2bc5dc1a4e873f0bf7f46a470a99ea08fb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 +, /* mClusterDimZ */ 2 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -34088,7 +35060,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -34096,6 +35068,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -34104,15 +35079,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaM */ 128 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -34120,10 +35095,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 -, /* mNumSlicesForSplitK */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 2 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 3 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -34139,14 +35114,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(0) +, /* mSplitK */ gemm::SplitK(2) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 64 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -34158,7 +35134,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 @@ -34184,7 +35160,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 896, "0b1a6cbaeb5be440d583f0d124b6623bc3d8fb51c9a5bdd2aac844761ece958d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 896, "66bb64327c0c319f7eaee2cc5e8659be20b3972903755b401560d3f5abea999b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -34206,6 +35182,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -34214,7 +35193,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -34222,7 +35201,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -34230,10 +35209,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 5 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -34252,11 +35231,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 64 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -34265,10 +35245,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 @@ -34294,7 +35274,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 896, "7c84b57014024bba8edb326d04a298be4989de3c1794d419f9def56c16c1b4ca", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 896, "262a9ab729165e696cb303487354a10319038fcfb8f72389ddf7236272e51126", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -34316,6 +35296,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -34324,7 +35307,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -34332,7 +35315,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -34340,10 +35323,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 5 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -34362,11 +35345,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 64 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -34375,10 +35359,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -34404,7 +35388,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "1fea6456b6265f121a62fa9c3a55e70d2476e95ec0502a812dcc1de0447deadc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "01413a3988941fb06742181419b7a92f28d82da6f9e63e17d11b161c2442cd0c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -34426,6 +35410,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -34434,7 +35421,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -34442,7 +35429,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -34450,10 +35437,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 5 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -34472,11 +35459,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 64 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -34485,12 +35473,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(2) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -34514,11 +35502,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "1279a627948ce7292ca4e1bb78edef63b388a524e72d97ddb5f1b3e5b558ff2e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "60aede6214b20ba266822d4f49341e78c4210df382ba8d7beedfdb24b797d518", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -34528,14 +35516,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -34544,15 +35535,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -34563,7 +35554,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -34580,13 +35571,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -34598,7 +35590,121 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 768, "700f61041637bef66d9ef03bad48129ef93d645a766d415d3e2fd1a2ccb54e2f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 2 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 256 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 @@ -34621,14 +35727,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "c8ff8fae8c41ca5b68e8fc13d611134c4aa8460c5eec1fdad0ca93dd517105df", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 768, "923b8540a0ee23c930c310a1d77188c8307c3d43931eb9069069c7c4e77c59eb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -34645,7 +35751,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -34654,15 +35763,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -34673,10 +35782,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -34690,13 +35799,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -34708,7 +35818,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -34731,14 +35841,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "6ee2de94bb9f050044800cbf29a7d4a97380762d5d8972315eba04efecc1b9c4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 768, "e6a6393f8b603f5ff7c562f8009781923f5192aa2737af2acfb8fca5a5d42719", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -34755,117 +35865,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mFuseUtccpWithUtcmma */ 0 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumEpilogueWarps */ 4 -, /* mNumRegsCastAWarps */ 0 -, /* mNumRegsCopySfLdsSttm */ 0 -, /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mSfBlockSizeA */ 16 -, /* mSfBlockSizeB */ 16 -, /* mSfBlockSizeC */ 16 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mSliceK */ 0 -, /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mTransposeMmaOutput */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseMaxTmemOverlap */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mValidM */ 256 -, /* mValidN */ 256 -, /* mValidK */ 256 -, /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 -, /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 -, /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 -, /* mNumRegsPerThreadLoadA */ 0 -, /* mNumRegsPerThreadLoadB */ 0 -, /* mNumRegsPerThreadLoadSfA */ 0 -, /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 -, /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 -, /* mNumWarpsLoadSfA */ 0 -, /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} -, /* mUseTmaOobOpt */ 1 - }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "397bbce8d0d69c73e416953ddc88c0adae0160acca87df7e17ea86fa1d754a62", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -34874,15 +35877,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -34893,7 +35896,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -34910,13 +35913,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -34928,15 +35932,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -34951,14 +35955,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "1fa4ab6dcfe6511262b6413c12d44fb347bff22c6a9769737ad98cb5aec7b600", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 768, "06438f710c38d0ad849f8730e23414de709b39249009aaeec87ff49a85232144", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -34968,14 +35972,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -34984,15 +35991,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -35003,7 +36010,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -35020,13 +36027,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -35038,7 +36046,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -35046,7 +36054,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -35061,14 +36069,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "0be38a345e02af78b6dae6751934a619a6435e8b6285c909975aeb894133e84e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 896, "afe41d5236e744ed50d017d9dbceed57dbd1019ee52d2ba0e4f09eeab5edce5f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -35078,14 +36086,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -35094,15 +36105,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -35113,10 +36124,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -35130,13 +36141,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -35145,18 +36157,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 1024 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -35171,14 +36183,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "0132d288d39de8583523ccf8cd6f41bf566bf2296de686db988baf0945b63dc9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 896, "7f9c6e7a2ae95c39d68a89c3b9c6f224a1ec9a241b12546e8bd646634b40c786", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -35195,7 +36207,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -35204,15 +36219,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -35223,7 +36238,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -35240,13 +36255,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -35258,9 +36274,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -35281,14 +36297,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "a6b8f6fe6b225c51e4486af480b8c00e91e3e2cab65cb8f6ad4568cbea52cde3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "dbd271f4b2faef9f3b45ad71dcf0693acbded3e5273f716cec4de7366b954962", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -35298,14 +36314,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -35314,15 +36333,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -35333,7 +36352,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -35350,13 +36369,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -35368,7 +36388,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -35376,7 +36396,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -35391,14 +36411,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "f4d57caceb52eb44a0ca673c93ec2a0237e78ea8d96ba81e5a14e12c1b7e4012", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 216824, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "ae10c83eccbb552eaae176ec0ef7a1c25dfe9ec2bfd147164f2a971bdfef2090", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -35408,14 +36428,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -35424,15 +36447,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -35443,7 +36466,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -35460,13 +36483,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -35478,7 +36502,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -35501,14 +36525,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "753252d5def1582dca4d68a4af2d13bce2ce267c0ac833486d0bdc0835ba2b9c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 768, "25a10a9ee5ef81ac7334211ae7dacba95ff8f2e3f48f3a0a89200b520ee12b43", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -35525,7 +36549,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -35534,15 +36561,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -35553,7 +36580,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -35570,13 +36597,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -35588,7 +36616,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 @@ -35611,14 +36639,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "f3092af1ebedd1adf9b42852d24d70e225a401ce4b4a14f0335f15ee23bef8da", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 768, "ef7280bc148704a480100dc3f5729395a5d00be44ab4b970e909972b8e406c57", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -35635,7 +36663,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -35644,15 +36675,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -35663,7 +36694,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -35680,13 +36711,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -35698,7 +36730,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -35721,14 +36753,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "cf819a3db90b2ce7f430c018128a28c00f6472eb161969dc6f7bd5502858b76d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 768, "e7baeadf83baa95b01038f3314a5b10f1f26af3a766896ed26fb025dc38f3d57", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -35745,7 +36777,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -35754,15 +36789,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -35773,7 +36808,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 9 +, /* mNumStages */ 5 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -35790,13 +36825,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -35808,7 +36844,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -35831,14 +36867,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len, 162208, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f", 512, "00b4d0839dad846db403a94108e1e153740d7ced590548c75a3ff4ce65428259", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 768, "822d2a5ce71af977115d4f4bd4202cd28b2670db1dcd102393e27bc08baeadcb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -35848,14 +36884,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 0 +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -35864,15 +36903,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -35883,7 +36922,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 5 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -35894,7 +36933,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfBlockSizeB */ 16 , /* mSfBlockSizeC */ 16 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 @@ -35902,11 +36941,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -35915,10 +36955,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 1024 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -35928,27 +36968,27 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 1 +, /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 2 +, /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadA */ 0 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfA */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 0 +, /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 , /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "b7cc298b36951f6d1d20d1a6e7839e11945b6de2256f419c8aa6f7cb92003971", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "fef23f8e3d9fe50d35c3052cf030d05748b4c3b140b2609b0e112fc711c1a7b2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -35965,7 +37005,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -35974,15 +37017,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -35990,10 +37033,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -36010,13 +37053,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 64 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -36028,7 +37072,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 @@ -36051,14 +37095,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "900059ffdaa34c18b634a20e2424c2b19a0664687a72e3c43268c7da29e2a7a4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "fa8fd5c4dbe59e8615e4e70482e8edbbf0d3f8a1de24f889dbda5fab34693407", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -36075,7 +37119,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -36084,15 +37131,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -36100,10 +37147,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -36120,13 +37167,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 64 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -36138,7 +37186,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -36161,14 +37209,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "393bca7f8fa59bcf1a9c21014c2d78972af57997aba3dbff1ed1bda9cfe8a967", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "173ab52cec3d313199534f7c7176d6853659a632500a38fec9fc38762ccef496", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -36185,7 +37233,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -36194,15 +37245,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -36210,10 +37261,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -36230,13 +37281,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 64 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -36248,9 +37300,123 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(2) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "7e4390b97295ab91bb5b71e33f3ecc10432a1f2a0057ec74f6330b57b21a4ca2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 2 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 256 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -36271,14 +37437,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "2c2faf119a3f0af582712e1dbc97711289f76d5ea506d538531a367631f4a776", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "53da50f6b70dee4eaf7965027dd8e1c2a7601cfc6cf71368dfa7c4977ef33ccb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -36295,7 +37461,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -36311,8 +37480,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -36320,13 +37489,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -36340,13 +37509,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -36355,7 +37525,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 512 @@ -36381,14 +37551,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "6dd90508e3094ab31e5cff83dd120a55d08ef5348465d734818290d71d797e8f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "a57dc3a61da9438ad5da55245c0a454a7b83557a8e38ca30ae8b90f30a955da3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -36405,7 +37575,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -36421,8 +37594,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -36430,13 +37603,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -36450,13 +37623,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -36465,7 +37639,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 512 @@ -36491,14 +37665,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 512, "0249a035b5d34084e0ff2648f0a737cc62559458ef2b2b0f32b7b678104bc096", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "edb0f39df63835429697e6af5fe1393d7e9d12ad49b873a0cd70bd87375630e1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -36515,7 +37689,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -36531,8 +37708,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -36540,13 +37717,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -36560,13 +37737,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -36575,12 +37753,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(0) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -36601,16 +37779,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 209656, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "e4c1947e0c1ff623224d9f94a31bc7aee5c4d2d845e2069b50a4c58650c83941", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 151384, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 640, "83607c3044321bf831500ea393803f1a32bad4b2406e1b1f4d09cc83277f48d0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 +, /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -36618,14 +37796,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -36634,15 +37815,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -36650,10 +37831,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 2 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -36669,14 +37850,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(2) -, /* mTileK */ 512 +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 64 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -36685,18 +37867,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -36711,16 +37893,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 209656, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "5d5be167b587320edc504bb67de7181317fd5ad628c3a674ed660fcaea031ac5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 896, "3ad9c82758a5acfd29580061a9d6dff9b170c0fb486cb34af701a4e38800a727", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 +, /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -36728,14 +37910,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -36744,15 +37929,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -36760,10 +37945,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 2 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 4 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -36779,14 +37964,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(2) +, /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 64 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -36798,15 +37984,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(2) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -36824,13 +38010,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 213752, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "36d6390f160850e1fee94de15e4ff9a1f6db198f37f00d28fdb28835f670a548", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 896, "399cba5e92d365ec8cbd55ddc38c42350cc03808017071648a15bb7a758ed8c9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 3 +, /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -36845,7 +38031,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -36854,15 +38043,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1536 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -36870,10 +38059,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 3 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 4 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -36889,14 +38078,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(2) +, /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 64 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -36908,9 +38098,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1536 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} @@ -36934,13 +38124,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213752, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "47e96f8f743648afcd645487c00bdbc7cce8f94308b69f4a185f3dee7617e012", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "dae1a7c8a29ad0da1e32f3955fddeedc260ac17c71dfe568e57045fd73f7f085", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 3 +, /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -36955,7 +38145,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -36964,15 +38157,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1536 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -36980,10 +38173,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 3 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 4 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -36999,14 +38192,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(2) +, /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 64 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -37018,7 +38212,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1536 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 @@ -37044,13 +38238,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 217848, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "fa05f522814e760e8578f48d78b251dff0819cf733a4b443af426753ffa1b5cb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "0baac1d015679e6499be16cd7bb7e1e0eddc01ec50659c143bcb7300c729760a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 4 +, /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -37058,14 +38252,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -37074,15 +38271,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -37090,10 +38287,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 4 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 4 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -37109,14 +38306,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(2) +, /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 64 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -37128,15 +38326,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 2048 +, /* mValidK */ 512 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(1) +, /* mActType */ gemmGatedAct::ActType(2) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -37154,13 +38352,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 217848, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "18fbd102638dc6dd8b70f2e799edd16683e1fdfa6f08f7b59ffc10e120bd5f7e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 896, "fcd028ced43e1f9c138f50b5b3be620a04778e381bbc2f80f1a4a3171ff2a7fa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 4 +, /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) , /* mDtypeA */ trtllm::gen::Dtype(17826818) @@ -37168,14 +38366,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -37184,15 +38385,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 +, /* mK */ 1024 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -37200,10 +38401,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumSlicesForSplitK */ 4 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 4 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -37219,14 +38420,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) -, /* mSplitK */ gemm::SplitK(2) +, /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 64 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -37235,18 +38437,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 2048 +, /* mValidK */ 1024 , /* mWorldSize */ 1 -, /* mActType */ gemmGatedAct::ActType(2) +, /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -37264,11 +38466,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len, 162208, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f", 512, "0d11ea83d184b85758152df122fd685436fac066200012b7e51a15c7be50ae56", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 896, "50a798f26e98d26c9c2ffc30e5641f2f41bc957b695a77fe9fa44faaa924db93", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -37279,13 +38481,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 0 +, /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -37301,8 +38506,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 @@ -37310,13 +38515,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 4 -, /* mNumStagesMma */ 1 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -37324,7 +38529,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfBlockSizeB */ 16 , /* mSfBlockSizeC */ 16 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) , /* mSfReshapeFactor */ 1 , /* mSliceK */ 0 @@ -37332,11 +38537,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 512 , /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -37356,25 +38562,253 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "2f9761756913a37c3d811777b3bc9e2ff93655c28cac997b83bb667018690b60", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 2 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 256 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(2) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 1 +, /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 2 +, /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadA */ 0 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfA */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 0 +, /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 , /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 196248, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 896, "6e31aa64887999689664132c637495efefac50392e32a72956352b7547369be0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 2 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 256 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(2) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "dc6ac074a30d89ca532d978c108faeab68578a5df50a7cd17d06676fbb9a005a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "645177775c9994dacacbc15dd9fb598262c55782106182f151093667eee604d3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -37396,6 +38830,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -37404,7 +38841,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -37423,7 +38860,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 9 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -37440,13 +38877,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -37455,10 +38893,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 @@ -37481,10 +38919,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "acf03f8fb6440c402378a65ffca4cbe764b91fdc5b1b4aad6271eea9e7c04008", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "9c2d7ff913b57de65bb0ba05dc8fc5b37b1366bfadf54680bcca861f530d401b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -37506,6 +38944,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -37514,7 +38955,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -37533,7 +38974,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 9 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -37550,13 +38991,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -37565,10 +39007,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -37591,10 +39033,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "e570048da60df3d2179f2f437b9b1e7a7e8df1b75fe27e1337ffd13f7a2da955", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "c7197e22b59815db21c6e30723db7917e8b6f0c01ca8fdf21be37f46c3e0398f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -37616,6 +39058,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -37624,7 +39069,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -37643,7 +39088,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 9 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -37660,13 +39105,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -37675,10 +39121,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -37701,10 +39147,124 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "abc2afc519706a7f0e7e56bca0ca950c1d342a72a55ced10ac393bbbf188a22b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "ef25f9ba3a48ff637c201fc737c7ec34fbdb16096914e830a45260d8fedce591", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "237a36b5965e322879d05c2f17716c2d7f627ef7373c0625d190e63e7a614f85", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -37726,6 +39286,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -37734,7 +39297,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -37753,7 +39316,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 9 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -37770,13 +39333,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -37785,10 +39349,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(1) , /* mClampBeforeAct */ 1 @@ -37811,10 +39375,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "2b8bdc1546ec47ffa676bc6c5f5e3cac5d5102c2e9b4b65a761dc64460c142cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "ad669e907866158b333ff52bf694b922088d0f1378636b3997641d7a945bab2b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -37836,6 +39400,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -37844,7 +39411,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -37863,7 +39430,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 9 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -37880,13 +39447,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -37895,10 +39463,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -37921,10 +39489,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 512, "4d6b0827257a95a38d389b2b43b7240ee9f61a2615c0f28267c7335bf5fd4f3b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "a2e8adb2cd75e41383a3efecc479a3c21c340cb1d6d164d473bb881d6b3a18c7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -37946,6 +39514,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -37954,7 +39525,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 1024 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -37973,7 +39544,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 9 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 @@ -37990,13 +39561,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 512 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -38005,10 +39577,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 1024 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -38031,10 +39603,5824 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "0f12db5f5124f213ec4f70e7ecaab4f9bce3fa56d65e7d7ca9bcd54bbf5509b1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "7c019518e8c168d95cb18d80c319a0cb5ae1076540aedd2c872b69899a7a9669", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(1) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "2d114c8d12de713beed820d99891198ce613f28dcbf6946977aef1bb5ed95063", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "9e1e9f025d9ec5dacc0358fbda868961479f711e607784c01865316ba61fac04", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 183408, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "7dad4a9f4e7b8ad97da964f0798c886f97e64fb5b396046da921b3825e853ded", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "66f38539fa36e77f4372630157bb395b8d09b714d4dcb01b80213d1ba1f93e9c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(1) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "9c5295e215792ef5927d373c64b43b067f2d50b925f975ddbe3fd759202c8bf6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "d74ea21f2571bc36bf9cfc9bf5ee604df5f12a2cbf3e92bb7f5435796635839d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f", 512, "ab8234cc482753d5705fca47a1bf31aba9947ae5efdc8b132b06abb717af677d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 9 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len, 162208, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f", 512, "580bf0d33f76d04487f683f860e7a8e0f512796e0ff255a8de21b4822bfb1d23", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 1 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 2 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 0 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "54ca604ebdadaf7d0da1400c7b78db06b78bc893aea8bb8bc4f9d9de43c21858", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(1) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "118e3a9ff7ec8648c5d491a8f3906fee2d76e47e581c53ae5441ecfccf50a1df", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "2dae6169beb9d4752f108ad026467317c26208df6881b7fef62af51cbbee0f6d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "9d0af31104ad6748cfc8584e5fc0e0d6bb67ee642ea3ef0be5d4c93c2c33f32e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "8740c8dc6c19dc8f3d3e16bc04564f313063dc296174a85f079cb9cb06dc03f5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(1) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "1f8812933f6a4fcc3d9e4b6fab0f5fc3744855141640ef747ba0e74038db5ab3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 512, "60e42c5cb21a203d1889812a96b00203630f8c0f29bb9686cdbf2f0080e6435f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 512, "95cefa483302f4ef06ede27fa897c5433a40594d62c1929d277440576dae5dd3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 209656, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "d4193df3d41d70deb72073e6b1e851caac9929116b3bf84d372769a5be32921e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(1) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 209656, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "e8abf24c5dbb7e8df06849dd598e80d9b165708d1bd09a5316bbe77f386c70ad", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(2) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 209656, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "c13f80bbac12b80bf289ac79e6aca89dffed68cce4b5622192d0720621d932ce", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(2) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 213752, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "c1a788a6007c2ce21109b1c59efda2754d4643e7025feec33786f53f14331ca3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 3 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1536 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 3 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1536 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(1) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213752, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "0026b6867d4d2fa0602edb807ec7c239f1417a26db2b251cf5affd1726d5e23c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 3 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1536 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 3 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1536 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(2) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213752, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "8310df3140a4481987cf9ae24ba172a8e8f5a1e020f789d649062847b330fddb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 3 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1536 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 3 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1536 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(2) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 217848, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "949db9eb4615d2fd91a419a78546bfc39136e00b2c011345a2ae27bbc693a98f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 4 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 4 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 2048 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(1) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 217848, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "0f15f12b03cefeb3072a9cfad400604bb179ddcb5f98ed2a164e99ad7714765f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 4 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 4 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 2048 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(2) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 217848, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "867b5c25e10baf579b42512d1f91744e7ef74bfa083765c7c8e0584a04f7d46d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 4 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 4 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(2) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 2048 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(2) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len, 162208, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f", 512, "44abbc4793547b61c27e12bb9d85eb2866e1c8169dc373af4cf0b2b6d455fe9c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 1 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 2 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 0 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 640, "b7c4ecf7051f8011a028935254fb1868cf99b6763500ee83ba4d7be9d2b70bfc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(1) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "33ac935bcb9c19b9ed821c9537971cb20a69a2aef485629a3d44d6f88675906a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "2d1140fef9c613e127e8fce5af20e397e42b425a9e415fe9bdc9cc48fbfde980", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 202480, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 640, "41216d23c56263d53e83233aa8fcab11d01015c60df98304910ee43bf0190633", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f", 512, "e475168e5526fd76f1db1f9964dcc8e03464b904b7a10786e5964e7694f23142", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(1) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "c0d6b926ca83414946b4216cfeb133a764396e2fc2a9198077ec6288d0dbb8b7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 512, "65a12a1988ae76d24f8a45f314874b195d41924ed459be59b338f7cdc5ff874b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 512, "cdfae91fe388d3cfcb860271dcac66399c7ce433daa0e6541c41aa00f3fe6938", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 1024 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ 16 +, /* mSfBlockSizeB */ 16 +, /* mSfBlockSizeC */ 16 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 512 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 1024 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 190592, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "2c0a49a404a9d6f84a819c3a16aa676b7c7a3e0c1e060ff11f3ebdcca0d7a9a7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 128 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 128 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 152 +, /* mNumRegsPerThreadNonEpilogueWarp */ 80 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 4 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 128 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 190304, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "c2f4eb86c167dc614e1def2299a2a4b694c1f71e78730290921866bc5896c5ed", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 128 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 128 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 152 +, /* mNumRegsPerThreadNonEpilogueWarp */ 80 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 128 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 211480, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 512, "5197372f88e928edcd53bdd42d38655b95a99e3a4cd154db14bb5d607d649191", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 2 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 128 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 256 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 8 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 128 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 211480, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "399ac758991dcd2c288d0e9097adff7a8235b4e3a579b43fc8ff51d29e0cd557", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 2 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 128 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 256 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 8 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 128 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 211480, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "2e84a6f316a6c50694c9d9cad8f6789e6e99edac1d53e98b0dd4a4759f1c8a9c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 2 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 128 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 256 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 8 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 128 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 190592, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "6fb07486e24dc10d2b1b5c3659b9c1f6b6692e09e6e3a6e7337322e492bfdeba", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 128 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 152 +, /* mNumRegsPerThreadNonEpilogueWarp */ 80 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 4 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 190304, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "e9dfb21e23f3b8d5df2a65822a00b305f23e69a236e94290138db3b4c2baba04", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 128 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 152 +, /* mNumRegsPerThreadNonEpilogueWarp */ 80 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 211480, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 512, "6e1acd66eb3603f9c0d3f4fb22c0ab39632ae96dad577a552b834732b5558d2b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 2 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 256 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 8 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 211480, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "3707d93bb2924745892cfea4d8a32a9eab9c7e6a2f75afbc5305e75ba0b7d885", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 2 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 256 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 8 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 211480, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "9efb2838900f1924849a121e766fc36a2e67eb6d44ccd774d18cef139bc4ccd7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 2 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 256 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 8 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 119216, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "7184ceb12c566648b528e959852c6107f70a31c8f8200dade0a401b5a2b4e0e9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 128 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 4 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 128 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 118928, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "58efd05b2b1aeddc2bd275161f726adbd1441f5060a283d455d5019b42096fcc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 128 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 128 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 119216, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "c00a391f99417557458703ce2d583e09067349a78ecc0e4dc2a59026e6afa43f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 4 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 118928, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "41694ebb37e65a93906a1bae0a04b86e723df2ec2f0b339995f82b85df3f7c73", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 227792, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "c802575473c80b5e47982228ff9b4c55968574e61d9ea7f0b0985f0021a5bb11", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 190592, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "10bea78630c8739b73ab0d765e7573899e908eba9177204c507fcef4ffec4545", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 227792, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "b5595895875f5b9caab4670fec464d635781ecc644966b7b3783ae734c1d867e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -38048,14 +45434,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 128 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -38063,29 +45452,29 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 128 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 128 +, /* mMmaM */ 128 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 152 -, /* mNumRegsPerThreadNonEpilogueWarp */ 80 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -38100,25 +45489,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 128 +, /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -38137,14 +45527,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 190304, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "34b81cc5b1ac2e1cacaeeffc8b32090a1c297e545a4a8cf3b4feef86ef9afcd5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 227792, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "53ad7ca2b6c622b641ae2d3a6890d8ac37fecc0ef6b2b39f1b19d401e73b9c29", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -38158,14 +45548,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 128 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -38173,30 +45566,30 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 128 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 128 +, /* mMmaM */ 128 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 152 -, /* mNumRegsPerThreadNonEpilogueWarp */ 80 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -38210,25 +45603,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -38247,18 +45641,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 211480, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 512, "1c8a14377cb53045a9917fece018815c92c95f14b43ee477e826adbc362486ac", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "01738e4ee57b20ea965a25ee6b8eb15b252a2916bc828e6db05bb5516d1d0241", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -38275,7 +45669,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -38284,29 +45681,29 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 128 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 256 -, /* mMmaN */ 128 +, /* mMmaM */ 128 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 128 -, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -38320,13 +45717,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -38338,7 +45736,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -38357,18 +45755,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 211480, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "1a978d243f4254610a238c87af9139b245e66858136ece17b36ede2d00aa3d7f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "5dc7506bb064db53dafa33c77d446fcf25416de72ea499acce0a92b90447e18d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -38385,7 +45783,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -38394,29 +45795,29 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 128 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 256 -, /* mMmaN */ 128 +, /* mMmaM */ 128 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 128 -, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 2 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -38430,13 +45831,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -38448,7 +45850,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -38467,14 +45869,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 190592, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "adab4ddcb23d3f925e3bff9d29acf1e535e70682f15b6ebfc3b6cbc3fcde881e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "db2b888a564c59975e92f4dfb7273a0e288820e30de55da7b5f658bde520c068", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -38488,14 +45890,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 128 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -38503,7 +45908,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) @@ -38511,22 +45916,22 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 128 +, /* mMmaM */ 128 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 152 -, /* mNumRegsPerThreadNonEpilogueWarp */ 80 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -38540,22 +45945,23 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 256 @@ -38577,14 +45983,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 190304, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "cfd70975aab9bf6bf6cf74f0fa8f9b951a46ca0dcc4dae74bbecfeb5ee7b8cc8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 227792, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "7eb947a1deb70da086ec6198145c44d6ac96949cfdd0354e79714252beea86f7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -38604,8 +46010,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 128 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -38613,30 +46022,30 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 256 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 128 +, /* mMmaM */ 128 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 152 -, /* mNumRegsPerThreadNonEpilogueWarp */ 80 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -38650,25 +46059,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 128 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 16 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -38676,7 +46086,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -38687,18 +46097,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 211480, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 512, "bbcd503930c9ef8efe7841fc8ca42af5a06ddc8639606176157c36acee81f677", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 227792, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "b9ffd797f2cf1d5d3e0e12cda3c25352b983a285336823550f5af5e45fea35e6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -38708,14 +46118,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -38724,26 +46137,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 256 -, /* mMmaN */ 128 +, /* mMmaM */ 128 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 128 -, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -38760,13 +46173,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 128 +, /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -38778,7 +46192,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -38786,7 +46200,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -38797,18 +46211,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 211480, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "9ee73850653d0f04787afa9cb5ed26e2f83c65b6839471294774fcf6fa885fe7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 227792, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "919feb308d711a56b7dd48fe5fada2891b25c6abfd4e3147e91715cfae7694f6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -38818,14 +46232,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -38834,26 +46251,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 256 -, /* mMmaN */ 128 +, /* mMmaM */ 128 +, /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 128 -, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -38870,13 +46287,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 128 +, /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -38888,7 +46306,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -38907,14 +46325,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 119216, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "de5234b4efc3e2be2ba7897c3d458c9d8e995a69a7695abf20daad89a5644f20", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "692a853eb3a8a63d77393196fd081b342696009d4dec8c9559197e84d263a56d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -38934,8 +46352,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -38943,15 +46364,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 128 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -38959,14 +46380,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -38980,25 +46401,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -39006,7 +46428,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -39017,14 +46439,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 118928, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "4deabc5dd45af21dd1e618bd9be6edea7b3fd658d3758b3e7871c1a7a3347fdd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "996263183ed779e9843bc72e90474db2e70ead2bf6f8b84a3c2f1ef34e42e185", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -39038,14 +46460,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -39053,15 +46478,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 128 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -39069,13 +46494,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -39090,25 +46515,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -39127,14 +46553,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 119216, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "9be3d8c1dc22c93e4a373d7ab12c7cfc5b8c66c599de1a24efa46b1d736629e0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "d083f530ef8a430031beadcd74c696ffeab9fd32c7ae4f3b725d87b3dc478531", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -39148,14 +46574,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -39163,15 +46592,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 256 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 128 , /* mMmaN */ 16 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -39179,14 +46608,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -39200,25 +46629,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -39237,18 +46667,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 118928, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "e76d65e80d21afb207a8226ed64113ea644f016c512aa78858ea3442e1df6839", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 212472, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 512, "c3a79514c7d3016530f8668761962bf7c289f787b344b4cf8ad0671bd4a69911", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -39264,8 +46694,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -39273,30 +46706,30 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 256 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 128 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 +, /* mMmaM */ 256 +, /* mMmaN */ 192 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 +, /* mNumStages */ 7 , /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -39312,23 +46745,24 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 192 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 128 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -39336,7 +46770,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -39347,18 +46781,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 227792, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "76616e3ed4db323c68047f2841bcb2dbc6723f7f53317bb910ca4039823266f8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 212472, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "c22befb7beec2d63e1d12ce3c41d0167f7a38eda9bc66d2cdff34a30a4a32773", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -39368,14 +46802,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -39384,26 +46821,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 128 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 +, /* mMmaM */ 256 +, /* mMmaN */ 192 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 +, /* mNumStages */ 7 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -39420,13 +46857,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 16 +, /* mTileN */ 192 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -39438,7 +46876,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 128 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -39446,7 +46884,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -39457,18 +46895,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 227792, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "c8c927b723eb7272ee8fde04cc050e89b9cec61716d98f0497a33214b9c42af9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 212472, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "08a39f242c76471499958aa5480bbf91821f9900f0234e89576d571df77055ee", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -39478,14 +46916,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -39494,26 +46935,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 128 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 +, /* mMmaM */ 256 +, /* mMmaN */ 192 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 +, /* mNumStages */ 7 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -39530,13 +46971,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 16 +, /* mTileN */ 192 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -39548,7 +46990,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 128 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -39567,18 +47009,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "0b6d17625ad09b10e807cd697c332691c32cd8b0c78609b80799a9f25a238a3e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 212472, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 512, "e0eccdcea0d80b003331c214a5ae9755acc3adfd4b53cf94ec2251daf2f59e9b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -39595,7 +47037,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -39611,22 +47056,22 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 +, /* mMmaM */ 256 +, /* mMmaN */ 192 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 7 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -39640,13 +47085,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 192 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -39655,7 +47101,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 256 @@ -39677,18 +47123,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "9bb137cf01f5a31aa59466be5ae2f7a8655196e4576ec42fe5cc67bae62bc854", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 212472, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "32e2911a5a4026509518fad5fc15513582c54fe369d88d3a9145cb8bc0dae3d9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -39705,7 +47151,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -39721,22 +47170,22 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 +, /* mMmaM */ 256 +, /* mMmaN */ 192 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 1 +, /* mNumStages */ 7 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -39750,13 +47199,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 192 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -39765,7 +47215,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 256 @@ -39787,18 +47237,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 227792, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "77ee796c0972ff675edef13b3c1246f046b49c17a80d5acfa1158c2b5357407a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 212472, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "88b8a23cbe3f90e12190f600a2d4aa115a7a8438e4f9747f256d79baef9345ed", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -39808,14 +47258,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -39824,26 +47277,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 +, /* mMmaM */ 256 +, /* mMmaN */ 192 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 +, /* mNumStages */ 7 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -39860,13 +47313,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 16 +, /* mTileN */ 192 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -39878,7 +47332,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -39886,7 +47340,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -39897,18 +47351,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 227792, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "fc757d774dfc01f8ee56f0fac9aac14812177063b3a34043d15eea9388665702", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 221656, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 640, "67613c8eb398d7053092b70585abf7e7165bff4a480327df40fd1945e7e14e13", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -39918,14 +47372,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -39934,23 +47391,23 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 128 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 +, /* mMmaM */ 256 +, /* mMmaN */ 256 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 4 +, /* mNumEpilogueWarps */ 8 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 6 @@ -39970,13 +47427,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 16 +, /* mTileN */ 256 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -39985,10 +47443,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 128 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -39996,7 +47454,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -40007,18 +47465,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "896d43bc99e892544d14e6e1c40fa225495ebe7510f24b97263de3beca8846d8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 221656, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 640, "07776452cf965f64bbe524602fc6d36ecfa69e5884599ea532a9910126b493e3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -40028,14 +47486,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -40044,29 +47505,29 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 128 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 +, /* mMmaM */ 256 +, /* mMmaN */ 256 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 4 +, /* mNumEpilogueWarps */ 8 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 6 -, /* mNumStagesMma */ 1 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -40080,13 +47541,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 256 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -40095,10 +47557,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 128 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -40106,7 +47568,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -40117,18 +47579,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "5565c5fd80c185d09bde0e197f7426cf75f15178dfe68ae9beb58b3662a9297e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 221656, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 640, "a4eca2369f72e396fe2b2ba093f8c2055cc146ae99cd407cdcc11f493c7a4105", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -40138,14 +47600,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -40154,29 +47619,29 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 128 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 +, /* mMmaM */ 256 +, /* mMmaN */ 256 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 4 +, /* mNumEpilogueWarps */ 8 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 6 -, /* mNumStagesMma */ 1 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -40190,13 +47655,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 256 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -40205,10 +47671,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 128 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -40227,14 +47693,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 212472, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 512, "c230ec4a113e6e2d5e21ec5ed351c1cb172b845158dfef8444cde4ee131a8b7c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 221656, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 640, "86583677036311815e42574f9dce3319a9505bbde64b0e74666027bef64d4348", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -40255,7 +47721,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -40264,7 +47733,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 128 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -40272,10 +47741,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 256 -, /* mMmaN */ 192 +, /* mMmaN */ 256 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 4 +, /* mNumEpilogueWarps */ 8 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 @@ -40283,7 +47752,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 7 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -40302,11 +47771,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 192 +, /* mTileN */ 256 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -40315,10 +47785,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -40344,7 +47814,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 212472, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "8aea9748c1819a60940dfa078d3e75769b7b939426736f48c8d1363b56436e6e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 221656, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 640, "1ec8d45fde137b3021001caf86a49cbdfd3a2b693f0512b781fcb5d1b9551fd8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -40365,7 +47835,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -40374,7 +47847,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 128 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -40382,10 +47855,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 256 -, /* mMmaN */ 192 +, /* mMmaN */ 256 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 4 +, /* mNumEpilogueWarps */ 8 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 @@ -40393,7 +47866,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 7 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -40412,11 +47885,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 192 +, /* mTileN */ 256 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -40425,10 +47899,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -40454,7 +47928,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 212472, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 512, "1ae27eca9a3fa771f1b46f6f8cdc29d06af50750c786c1b2b40c38bd6782f06a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 221656, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 640, "e2783079f767599363804f5f8f1380f3922743479d55b61a04b8cabcab734356", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -40468,14 +47942,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -40492,10 +47969,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 256 -, /* mMmaN */ 192 +, /* mMmaN */ 256 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 4 +, /* mNumEpilogueWarps */ 8 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 @@ -40503,7 +47980,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 7 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -40522,11 +47999,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 192 +, /* mTileN */ 256 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -40546,7 +48024,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -40564,11 +48042,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 212472, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "53da81b182577367107d4ce72907cffc32afd1cd30d20e622a5e9be71d3b13ad", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 133552, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "b2f063134d9204f08c963318288aff012908bde2b19f828d9f7f44da5e556671", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -40578,14 +48056,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 +, /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -40593,29 +48074,29 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 128 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 256 -, /* mMmaN */ 192 +, /* mMmaM */ 64 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 128 -, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 7 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 4 +, /* mNumStagesMmaWithinWorkTile */ 2 , /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -40632,23 +48113,24 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 192 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 1 -, /* mUseShuffledMatrix */ 1 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 128 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -40667,18 +48149,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 2 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 221656, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 640, "21aaf76c12b3fc47df74bbd86c0384deea1e8ce9dae233b482ca12c075e1db74", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 133264, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "411b5a1fc76ffa400f339d80f019fd0cc8f567505d4f2a4945389d35f920bd98", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -40694,8 +48176,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -40703,7 +48188,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 +, /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 128 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) @@ -40711,22 +48196,22 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 256 -, /* mMmaN */ 256 +, /* mMmaM */ 64 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 8 +, /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 128 -, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 6 , /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -40742,16 +48227,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 256 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 1 -, /* mUseShuffledMatrix */ 1 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 @@ -40766,7 +48252,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -40777,18 +48263,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 2 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 221656, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 640, "93a9379ad1f72d9a3603275c88618b1a9fc95e96afbf429d5f45a2d508475cb7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 133552, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "f496b3e4db0810bca13a238b951f4d388989c1c4c6ecb32d038c11cf1a59c8f9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -40798,14 +48284,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -40813,29 +48302,29 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 128 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 256 -, /* mMmaN */ 256 +, /* mMmaM */ 64 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 8 +, /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 128 -, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMma */ 4 +, /* mNumStagesMmaWithinWorkTile */ 2 , /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -40852,23 +48341,24 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 256 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 1 -, /* mUseShuffledMatrix */ 1 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -40887,18 +48377,132 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 2 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 221656, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 640, "47fe3d4cb8dcb6e2e012ac08c19bc445ef0e55a37f9f7a523a9cba4d7d050ea7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 133264, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "ad2c4d86e29eb5509a1c2ded57801a3016941c5ae27251ff763ad939a04fce95", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 256 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 213424, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "779847477fd4bb4da5c0bd609f694e36a161e34f113b83e7c23db34fed03a69e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -40915,7 +48519,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -40931,19 +48538,19 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 256 -, /* mMmaN */ 256 +, /* mMmaM */ 128 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 8 +, /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 128 -, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 +, /* mNumStages */ 5 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -40960,13 +48567,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 256 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -40975,7 +48583,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 256 @@ -40997,18 +48605,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 221656, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 640, "9819b6c04ea4707483bf7fee5e05d1d35f4c1e7a834b501864e606439c257617", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213424, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "e3312bef21c92eeb469a4d59a69a0e4390cac5caf53441743242ee76062a2d7a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -41025,7 +48633,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -41041,19 +48652,19 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 256 -, /* mMmaN */ 256 +, /* mMmaM */ 128 +, /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumEpilogueWarps */ 8 +, /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 128 -, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 +, /* mNumStages */ 5 , /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 @@ -41070,13 +48681,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 256 +, /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -41085,7 +48697,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 256 @@ -41107,14 +48719,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 133552, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "e239d53043619321ba67def1c1e568db27762c086b5403b5b3144fb34d00bcd4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213424, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "0bd42b53bc39c5ef383c16a53e26dae8d6d2dcbabd57dec3d98dc0ea6a7ee2d9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -41128,14 +48740,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -41143,15 +48758,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 128 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -41159,13 +48774,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -41180,25 +48795,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -41217,14 +48833,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 133264, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "afe96180a941e897e032bfa90b3f3be1f5e364c570bf4a6936120b88190b5df9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "877eca3c7b7db387e8d0833a0cad9cd24382d247eb74f1a989e46d58a451a8b4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -41244,8 +48860,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -41253,15 +48872,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 128 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -41269,13 +48888,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -41290,25 +48909,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -41316,7 +48936,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -41327,14 +48947,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 133552, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "0ee9c206ebc62b3adcc210817908f2970177d94dac95a3ead6e8f22b6c957a6e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "b558ba1579ef08f2ed2b3a1b170785e9748e58d3559a51e32b5b7cbb6d3cf54d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -41348,14 +48968,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -41363,7 +48986,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) @@ -41371,7 +48994,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -41379,14 +49002,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -41400,22 +49023,23 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 256 @@ -41437,14 +49061,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 133264, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "c486f36d1360bb4b6b62cb72961abbc1ca51aa5b4f87577c3dcbd06e3c87dfec", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "befcf2a22cc3cf63262518d0b0a41d4b4419544d5e276727f563704d6db12a2d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -41458,14 +49082,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -41473,7 +49100,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) @@ -41481,7 +49108,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 128 , /* mMmaN */ 32 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -41489,13 +49116,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -41510,22 +49137,23 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 256 @@ -41547,14 +49175,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 213424, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "780d767449edfa48e39cc780f4e682e0d645895ad0dc00c4785f8b602d4a419f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 213424, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "62ff3b8c90693617df3001d326aee7a6930acc7a63ca2f6e12e2393c46206685", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -41576,6 +49204,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -41584,7 +49215,121 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 1 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213424, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "7715f47c5135b4bb261721e71f95e5504fac6ee8b53d822b7382ba6df0a19e83", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -41627,6 +49372,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -41635,10 +49381,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -41646,7 +49392,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -41664,7 +49410,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213424, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "8bc7a9eb910b1759647cc869c7d76302135de4e666655db998ba870b5b2519be", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213424, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "25d50e75c7051a0124c73b74ef05f4ef125b7a74947756fa32dd1993e88ec287", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -41678,7 +49424,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -41686,6 +49432,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -41694,7 +49443,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -41737,6 +49486,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -41745,10 +49495,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -41774,7 +49524,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "96faae3fd1d5d672746ceee471ea645ae791a2cb7bc95b363215b3acd402aa92", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "639f2a6cbdd30ded2f20c94e1dec2c51c4186be449b6cffe341f62a6506f20fd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -41796,6 +49546,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -41804,7 +49557,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -41847,6 +49600,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -41855,10 +49609,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -41884,7 +49638,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "45ad4c2603016f88b5c3f9a43bf744f77854f97c09ff3950dbab7277a88a00f7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "e92c75c751de4e213a73cd3cb24a0f1cff45a669b2a51c08b45b2557d643b3f9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -41906,6 +49660,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -41914,7 +49671,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -41957,6 +49714,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -41965,10 +49723,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -41994,7 +49752,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 213424, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "aff4d9bacf5e87172a64929b6220e5a4749e9d60ac3d838075c5721549e277be", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "dc2c960237809caf9b53824f03f6c3dbede7487341cf68ed87f7ee97987d6f43", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -42008,7 +49766,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -42016,6 +49774,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -42044,9 +49805,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 5 -, /* mNumStagesMma */ 2 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -42063,10 +49824,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -42086,7 +49848,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -42104,7 +49866,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213424, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "2121822aa18c44e48206609dd993983225e3b008694c0f9ee886f654cf90776e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 164016, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "68f1fed01df47d98b7e50e165405cf017ccc193df3f1dd6908668c0b4ec26f23", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -42118,14 +49880,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -42133,29 +49898,257 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 128 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 64 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 4 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 128 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 163728, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "86f01acb60e8bde158dd8646193473fa183240b3a3bedd403da2e0130fe3f026", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 128 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 128 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 128 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 164016, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "fc58e912146d1f7b93f8492ec7f76343befb242d15a95396173d8b3fa01d3e74", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 256 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 4 +, /* mNumStagesMmaWithinWorkTile */ 2 , /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -42170,25 +50163,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 32 +, /* mTileN */ 64 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 1 -, /* mUseShuffledMatrix */ 1 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -42207,14 +50201,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 4 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "0605a1e713d94d18eebf51f2c4cd110fb426cd13eb25377abdaacf50b459a746", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 163728, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "63449a3fc3950e963844eb49a4e617b2886dee8cc5da2deb88ba96425f9fbdbb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -42234,8 +50228,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -42243,29 +50240,29 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 64 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -42280,25 +50277,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 32 +, /* mTileN */ 64 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 1 -, /* mUseShuffledMatrix */ 1 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -42306,7 +50304,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -42317,18 +50315,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 4 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "fad72e41f566cf086d8f9f9a2bc3a569f882b780ccad4819154e8a2886725bde", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 218552, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 512, "e5d30f53a37b9d7d546117c347f1f3a692996d7e18a79130621149742615f401", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -42338,14 +50336,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 +, /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -42354,29 +50355,29 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 +, /* mMmaM */ 256 +, /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 5 -, /* mNumStagesMma */ 1 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -42392,11 +50393,12 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSplitK */ gemm::SplitK(0) , /* mTileK */ 256 , /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileN */ 64 +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -42405,10 +50407,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -42416,7 +50418,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -42427,18 +50429,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 164016, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "14b4121f74c5325fdaa9eb893f28ffff7fdc6a4c6b4bbab0352ef86060d7c229", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 218552, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "848568de867415876ecc2324f27f6aafbe15a8ec375a52d9a76135a8553da762", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -42448,14 +50450,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -42463,15 +50468,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 128 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 256 , /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -42479,13 +50484,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -42500,25 +50505,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 64 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -42537,18 +50543,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 163728, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "22d2c93c7ce2d6b262259d4ced9585b578adce682b37bbbe2445e505b299f321", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 218552, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "c59966028657efd1eb71cc5441718bd3b749f076fd10fa7c1637cee0db2181b2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -42558,14 +50564,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -42573,15 +50582,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 128 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 256 , /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -42589,14 +50598,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 +, /* mNumStages */ 5 , /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -42610,25 +50619,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -42647,18 +50657,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 164016, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "dedf038750ba4522b0cc81e393705d652b65089173e45522aa9898a7968778e2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 218552, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 512, "3da0b443881c71ca893f51a171877436394a32bd828f0e84d9861e7318d17b4d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -42674,8 +50684,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -42683,15 +50696,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 256 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 256 , /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -42699,13 +50712,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -42720,25 +50733,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 64 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -42746,7 +50760,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -42757,18 +50771,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin_len, 163728, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f", 512, "c92f86f3267cd9cf3fa61358213156b1771f405265b50118cfea4b24abb72e57", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 218552, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "3d17cd4accd5e945ec1eb6c119ea446a3642db2772db232ee531f3c5615786ef", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 1 +, /* mClusterDimX */ 2 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -42778,14 +50792,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -42793,15 +50810,15 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 256 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 256 , /* mMmaN */ 64 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -42809,14 +50826,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 128 +, /* mNumRegsPerThreadNonEpilogueWarp */ 56 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 +, /* mNumStages */ 5 , /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -42830,25 +50847,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -42867,14 +50885,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 4 +, /* mNumWarpsLoadB */ 8 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 218552, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 512, "97c17ccb77239cc24f67749ea08bbd6b9ba9505742a5df82616bc75e10d926b5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 218552, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "b171248735734be7863fbdd224ba8d3eab0d3340afb46941125aadc42ff53d39", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -42888,7 +50906,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -42896,6 +50914,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -42904,7 +50925,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -42947,6 +50968,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -42955,10 +50977,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -42966,7 +50988,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -42984,11 +51006,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 218552, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "d707910d4b43668d45a660c10853105334ce986ad5d7a8fd564fdb20ddc4ac75", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len, 77600, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f", 384, "caebbaf5771493937cde67c16b6677a764139c37b845d26c2fc3318045fa7a5c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -42998,14 +51020,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) -, /* mEnablesEarlyExit */ 1 +, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -43013,30 +51038,30 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 128 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaM */ 64 +, /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 128 -, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 4 , /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -43050,25 +51075,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 1 -, /* mUseShuffledMatrix */ 1 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 128 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -43078,27 +51104,27 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 0 +, /* mIsStaticBatch */ 1 , /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 128 +, /* mNumBatches */ 2 , /* mNumRegsPerThreadLoadA */ 0 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfA */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 2 +, /* mNumTokens */ 0 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(2) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin_len, 218552, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f", 512, "fdc7f5c9ef8f981f6e7cfde61ff766ea5e9ede83ef7314155ccf95d19c4c829e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 148240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "846b532c796dc0e127798665c6524e7d019d85fc681eec1bbdadfc48c7d8d566", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -43114,8 +51140,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -43123,29 +51152,29 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 128 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaM */ 64 +, /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 128 -, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStages */ 8 +, /* mNumStagesMma */ 4 +, /* mNumStagesMmaWithinWorkTile */ 2 , /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -43160,25 +51189,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 64 +, /* mTileN */ 8 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 1 -, /* mUseShuffledMatrix */ 1 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 128 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -43186,7 +51216,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -43197,18 +51227,18 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 2 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin_len, 218552, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f", 512, "afbbfe45241b22715d97ca37d335ccc77369f7a41ea68688d4b76febaf8e3d60", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 147952, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "e12aec7b0ed57c48c93b429221281667d69882bc77a876f933acef919cbd5f5b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 -, /* mClusterDimX */ 2 +, /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) @@ -43218,14 +51248,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -43233,30 +51266,30 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 512 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 128 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 256 -, /* mMmaN */ 64 +, /* mMmaM */ 64 +, /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumEpilogueWarps */ 4 , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 128 -, /* mNumRegsPerThreadNonEpilogueWarp */ 56 +, /* mNumRegsPerThreadEpilogueWarp */ 160 +, /* mNumRegsPerThreadNonEpilogueWarp */ 48 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 +, /* mNumStages */ 8 , /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -43270,25 +51303,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 256 +, /* mTileK */ 128 , /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 0 +, /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 1 -, /* mUseShuffledMatrix */ 1 +, /* mUsePerTokenSfB */ 0 +, /* mUseShuffledMatrix */ 0 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 512 +, /* mValidK */ 128 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -43307,14 +51341,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 8 +, /* mNumWarpsLoadB */ 2 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len, 77600, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f", 384, "5e2e258dca62724ab45922f00bc5652a05bad01f9823a6249809ab09e15be569", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len, 77600, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f", 384, "efc37cf74d1a51fe2bb804060927f47810f9ed8f9640debd4f5f25c75533769c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -43336,6 +51370,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -43344,7 +51381,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 128 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -43387,6 +51424,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -43395,10 +51433,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -43424,7 +51462,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 148240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "36773cfa9b99b0ff4d25c5b2a552bde2d0edd1e2bb66c2e1a31a098a5b9519be", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 148240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "08888a00664ed94f652eccbf1e643e4fd127fa3affe599709e8937e7b0cef514", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -43446,6 +51484,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -43454,7 +51495,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 128 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -43497,6 +51538,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -43505,10 +51547,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -43534,7 +51576,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 147952, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "b708c6e1a57de0692a219e8ba73663d6b8fcca40e6ec65c2cadc60693819eef2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 147952, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "aa028807fd5a6a7e6d74d8ba1dbca40a88a852c3ab39baa7c810ce97e7b216b4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -43556,6 +51598,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -43564,7 +51609,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 128 +, /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -43607,6 +51652,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -43615,10 +51661,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 128 +, /* mValidK */ 256 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -43644,7 +51690,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len, 77600, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f", 384, "11328576ef18d587a24abf0e95dc6459531509088a8a4a814658941d9363d4ab", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214480, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "e47ef1efb0abc48b8e8527f980306b095ed7b4a7c855abe1ab847538a9ab104a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -43659,13 +51705,16 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEltwiseActType */ gemm::EltwiseActType(0) -, /* mEnablesEarlyExit */ 0 +, /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -43673,7 +51722,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) @@ -43681,7 +51730,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -43693,10 +51742,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -43710,22 +51759,23 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 256 @@ -43736,25 +51786,25 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 -, /* mIsStaticBatch */ 1 +, /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 -, /* mNumBatches */ 2 +, /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadA */ 0 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfA */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 -, /* mNumTokens */ 0 +, /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 , /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 148240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "e7c95d138fb8526a68525f61fcabfa378b6ff0e767a5dfad18c4b1ed27c36251", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 214480, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "c91514ec1d85c0c728a2f65df78a89df337d27c54bb2ab19aa4f25a5dc12655a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -43768,14 +51818,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -43783,7 +51836,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) @@ -43791,7 +51844,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -43799,13 +51852,13 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 @@ -43820,22 +51873,23 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 256 @@ -43857,14 +51911,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin_len, 147952, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f", 512, "a04193becbe4b2951af6be63662d570c3dba78593fb3c78b1619446c044218c2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 214480, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "3c57f969caeeadc8b8a844f011107d7e8e15c8805949459589a114f8172bb87e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -43878,14 +51932,17 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 , /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 +, /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -43893,7 +51950,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 +, /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 256 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) @@ -43901,7 +51958,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 +, /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 @@ -43909,14 +51966,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsCastAWarps */ 0 , /* mNumRegsCopySfLdsSttm */ 0 , /* mNumRegsCopySparsityInfo */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 160 -, /* mNumRegsPerThreadNonEpilogueWarp */ 48 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 +, /* mNumStages */ 6 , /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -43930,22 +51987,23 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSliceK */ 0 , /* mSparsityA */ trtllm::gen::Sparsity(0) , /* mSplitK */ gemm::SplitK(0) -, /* mTileK */ 128 +, /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 -, /* mUseDeepSeekFp8 */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseShuffledMatrix */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mValidM */ 256 , /* mValidN */ 256 , /* mValidK */ 256 @@ -43967,14 +52025,14 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadLoadSfB */ 0 , /* mNumTokens */ 2 , /* mNumWarpsLoadA */ 0 -, /* mNumWarpsLoadB */ 2 +, /* mNumWarpsLoadB */ 0 , /* mNumWarpsLoadSfA */ 0 , /* mNumWarpsLoadSfB */ 0 , /* mRouteImpl */ batchedGemm::RouteImpl(2) , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214480, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "497b0f26b6690cda895ae72ebebbb9a2f304196d69c7db8911a018842fbc946f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 256, "9a5b799859e98ce8f34031d7644a9bace4cc972494bc298344dc984ca367e079", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -43996,6 +52054,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -44024,9 +52085,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 6 -, /* mNumStagesMma */ 2 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -44043,10 +52104,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -44084,7 +52146,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 214480, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "4ffed82144fdf040ec77bfa44fcd6c07770a0b7a8f92e48ff72a40e84b116399", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 256, "9a4e01b26d82e92734dd9a9d01ede8079e8806ee0441db7f565347db4fc514f8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -44106,6 +52168,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -44134,9 +52199,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 6 -, /* mNumStagesMma */ 2 +, /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -44153,10 +52218,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mTileScheduler */ gemm::TileScheduler(0) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -44194,7 +52260,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 256, "f00cd8c1985b2766807644ad37e10034049d6cf64362eb85107fd46c35be2fbe", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 256, "627b2ac38c34b5ef656882a55caa1bc59da0ff9d68ecbfed7e6bf10790c6a503", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -44208,7 +52274,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -44216,6 +52282,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -44267,6 +52336,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -44286,7 +52356,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -44304,7 +52374,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 256, "0fb8a1dfd2521f25fa1db0178b7d28ee2e8a1e53d233c4b68303e9d60dc86734", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214480, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "90d07e6ab949b4bf1757da549fc01208a5e3c7fa008a105c43cefa91fa247aae", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -44318,7 +52388,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -44326,6 +52396,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -44334,7 +52407,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mGridWaitForPrimaryB */ 1 , /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 256 +, /* mK */ 512 , /* mKernelTraits */ {} , /* mLayoutA */ gemm::MatrixLayout(0) , /* mLayoutB */ gemm::MatrixLayout(0) @@ -44354,9 +52427,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 , /* mNumStages */ 6 -, /* mNumStagesMma */ 1 +, /* mNumStagesMma */ 2 , /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 , /* mPatchF2fp */ 0 @@ -44373,10 +52446,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileK */ 256 , /* mTileM */ 128 , /* mTileN */ 8 -, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTileScheduler */ gemm::TileScheduler(1) , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -44385,10 +52459,10 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mValidM */ 256 , /* mValidN */ 256 -, /* mValidK */ 256 +, /* mValidK */ 512 , /* mWorldSize */ 1 , /* mActType */ gemmGatedAct::ActType(0) , /* mClampBeforeAct */ 1 @@ -44396,7 +52470,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -44414,7 +52488,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214480, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "60dba8c49d0f2f9a7aa945ca9ef52fc0433091712926387419b7223de55e47e0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 214480, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "3c48dfb93ea065bfb1fc3b5e0c4193caf3769b43889c39299b4b7c7bd7e7f2d0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -44428,7 +52502,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(0) +, /* mEltwiseActType */ gemm::EltwiseActType(2) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -44436,6 +52510,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -44487,6 +52564,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -44506,7 +52584,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mBatchStrideInTokens */ -1 -, /* mFusedAct */ 1 +, /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 , /* mIsUniformNumTokensPerBatch */ 0 @@ -44524,7 +52602,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 214480, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "756e8559479ba4d67e8d6d92e47a64e8774bbd43a9107e8b7150323db6ab5803", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 214480, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 384, "8b1f34836e0d9459464a99402205a13cc3bf91d416eb5061b9b3e76f3e494961", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -44538,7 +52616,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEltwiseActType */ gemm::EltwiseActType(2) +, /* mEltwiseActType */ gemm::EltwiseActType(3) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -44546,6 +52624,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -44597,6 +52678,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -44634,7 +52716,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 256, "05947b85dd17b929cd15da57fa3ad78b2c6531bd038de579cf2dd54fbe98ac04", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f", 256, "5c1143b3be806923ad1f13870e678d73a4ac54b75c779d68c43f9429066b5c44", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -44656,6 +52738,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -44707,6 +52792,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -44744,7 +52830,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 256, "711d19e0cc7889cb0b868d16e56e29e8c40d728703d8d272d39e3ced1cd4bc0f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 256, "be59f07d33ee73a7f28ec940cfc05ef783a89d9df9f6306f4cba7cd007de51d8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -44766,6 +52852,123 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 +, /* mFuseUtccpWithUtcmma */ 0 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 512 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumEpilogueWarps */ 4 +, /* mNumRegsCastAWarps */ 0 +, /* mNumRegsCopySfLdsSttm */ 0 +, /* mNumRegsCopySparsityInfo */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mSfBlockSizeA */ -1 +, /* mSfBlockSizeB */ -1 +, /* mSfBlockSizeC */ -1 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mSliceK */ 0 +, /* mSparsityA */ trtllm::gen::Sparsity(0) +, /* mSplitK */ gemm::SplitK(0) +, /* mTileK */ 256 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mTransposeMmaOutput */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseMaxTmemOverlap */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 1 +, /* mUseShuffledMatrix */ 1 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mValidM */ 256 +, /* mValidN */ 256 +, /* mValidK */ 512 +, /* mWorldSize */ 1 +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 +, /* mFusedAct */ 0 +, /* mGridWaitForPrimaryRouting */ 1 +, /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 +, /* mNumBatches */ 128 +, /* mNumRegsPerThreadLoadA */ 0 +, /* mNumRegsPerThreadLoadB */ 0 +, /* mNumRegsPerThreadLoadSfA */ 0 +, /* mNumRegsPerThreadLoadSfB */ 0 +, /* mNumTokens */ 2 +, /* mNumWarpsLoadA */ 0 +, /* mNumWarpsLoadB */ 0 +, /* mNumWarpsLoadSfA */ 0 +, /* mNumWarpsLoadSfB */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(2) +, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} +, /* mUseTmaOobOpt */ 1 + }, gemm::SmVersion::Sm100f}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin_len, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f", 256, "36e0a578cf64ee49d253a78c357095026d1f5d804914aa246b233f4433702011", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClcFastDrain */ 1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(3) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -44817,6 +53020,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -44854,7 +53058,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len, 214144, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f", 256, "7586849862f48c35dc51ef72de4e3614fddfe4d4e9dc81ac181615f9cbcd93c8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len, 214144, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f", 256, "c9b1d2561c959a94b0fdb87bf086963d1df9d6676a7cf1a0bce91b7f162f0b80", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -44876,6 +53080,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -44927,6 +53134,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -44964,7 +53172,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len, 214144, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f", 256, "b322893a56acf627a2c76ba065a3e006f16b4e7ba61ddfa566854cedbacfb330", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len, 214144, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f", 256, "1d6b606fc88ee6fc7fad44d838d6a6c5277cc965c815cf3a0cb6e28085e30300", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -44986,6 +53194,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -45037,6 +53248,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -45074,7 +53286,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len, 214144, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f", 256, "9a42bf19e464c20f02c763dac3f06691dac7675b5fc384fe79b693322bb97b54", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len, 214144, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f", 256, "91850eca1bf5b2eaf94e3a46b0ab682e772c656f09be0898f7030a897faba4aa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -45096,6 +53308,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -45147,6 +53362,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -45184,7 +53400,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len, 214144, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f", 256, "da07bbc5373000c6d5a453d46b0f4c9620fa4a88210390c049b0bc00ad0042fd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len, 214144, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f", 256, "8f9f5b9287f41103864e9eab684c1618078ad2da274f0cd3fc57797213a68092", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -45206,6 +53422,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -45257,6 +53476,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -45294,7 +53514,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 197200, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "9932153ef5ad86fc4b7696506c9075145a00a492c2ace8697c262fcfc0eea43b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 197200, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "96988a185ae179937cd0075911076d651ff4341ab52b2e7906bc3fcabbc495e0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -45316,6 +53536,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -45367,6 +53590,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -45404,7 +53628,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 196960, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "769cb2dfc0085f361090c8252efab80500b844aec3a2c7dbd9b0593caf974acb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 196960, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "3f91cc7e530610714d37d1f4f445240cb2b06738eb36027d69af9296904d6eaf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -45426,6 +53650,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -45477,6 +53704,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -45514,7 +53742,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 197200, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "d08d72b8f206822bf85906244e4f05effbb198d637c6303cb46d839cda6614ce", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 197200, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "4021172dea380144838afc9aa1724fd31b213e5f456b6f2957fd74d443107ba4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -45536,6 +53764,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -45587,6 +53818,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -45624,7 +53856,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 196960, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "7ba3ca52424fef94ee1169be10ea2ffe86c866aada27f167d1c7031a62372b42", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 196960, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "2195f0af0858d5263a3be244c298c045639a6d862869cc513e71e0f5261842b4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -45646,6 +53878,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -45697,6 +53932,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -45734,7 +53970,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 219728, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "e939b686444d876f82e0e8af796aaea14da9cbabfabb735ff19f99648461aacc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 219728, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "c9b760668328173094a04a2b02271a9676f0752420b5ac4f356497e3f27a5155", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -45756,6 +53992,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -45807,6 +54046,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -45844,7 +54084,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 219488, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "f500b0672062287cf5385da6b8f4cb3b8dc93922faf0466b55820963bccf702d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 219488, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "b9cf360be3f0bc8949972a61f1b4f3f3345f184b2412a47ba57e63ecf8a59aa0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -45866,6 +54106,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -45917,6 +54160,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -45954,7 +54198,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 219728, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "f7fe273a644d76d3de6a6ae9e6105522fd73c2be3208e9aae648b462791c167d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 219728, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "1529b73b4514bc42bbc245590453f653d15d4945c318bdc55d53e2a5e7ef99fb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -45976,6 +54220,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -46027,6 +54274,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -46064,7 +54312,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 219488, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "fe8c4bf3f2c7514381f7d124eaa33c910f0af6a04bb07dd5ff826b9a2a8af703", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 219488, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "cd1b01ba95e1b642f4dfd44e6d944a6ffd710385e5534687b419b6a0e2ea9358", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -46086,6 +54334,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -46137,6 +54388,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -46174,7 +54426,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214544, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "c3c0291634ae7e86f158af51dc8b383431dab655f91e95baf129e2737c51a640", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214544, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "8da9fd9e492c359a1b7c785e1f1d69f7fa438a96ff93e4a883a01de10c0acbd2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -46196,6 +54448,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -46247,6 +54502,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -46284,7 +54540,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214304, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "a72bc6a9c2270996a0624aa357cc81234ccd1f8aa236cbe8176936aafc77bcc2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214304, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "c3e56810270e2d35d3e72cdd4eccba186f559ea4a0a65d8dbc0bc2dfacd6df14", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -46306,6 +54562,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -46357,6 +54616,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -46394,7 +54654,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214544, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "6c7c13c61d622ce70ec512272aacc5711018f4293304139f395398c21e40f315", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214544, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "508b9d11109bd267e5039d7be3f3a2d80a64f24345d2a57ff06926ed73aa6aad", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -46416,6 +54676,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -46467,6 +54730,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -46504,7 +54768,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214304, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "66a517777b4a9912e343155ef8dfbf6d6f71efc209be382e1b0daafbffe4cee3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 214304, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "7160076f2cb8a882c489b5484986faa9e43f8f412e87ce4cb02cbc1b4acb34f3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -46526,6 +54790,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -46577,6 +54844,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -46614,7 +54882,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 185936, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "7a6531a59fe86b239952561e418fa58180e47a5562ea58bfb7d59d55cf928781", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 185936, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "e3c98d05937fe236de2ac659d3b93815d51a124bcd87620e073d156189a59d53", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -46636,6 +54904,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -46687,6 +54958,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -46724,7 +54996,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 185696, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "e03e22f1da49e9ed9d612496e91f4ecf3d624a6e4cdf1aac8bd15eb9f4f8f030", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 185696, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "b6e03f7051e85cd83905490dcf16b95fdd8d115b5b173b00bc92c3f1b125039e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -46746,6 +55018,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -46797,6 +55072,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -46834,7 +55110,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 185936, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "fb10688b64b96277647f6dcaf802d5619fba6fccd3c4f806213e1717172c21cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 185936, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "3aac61a827af266fe0db5f1f105612f87405bccda994674ce50083874ab4b9b1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -46856,6 +55132,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -46907,6 +55186,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -46944,7 +55224,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 185696, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "09f9eb32dc8060da21f95b8ddf523e04f7a66ee204137e29a755dde89a05cc22", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 185696, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "17422ea5910409c94098a70735101168f87558ce9a837cde1f38482391ad7d07", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -46966,6 +55246,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -47017,6 +55300,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -47054,7 +55338,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221648, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "6cd27a06ae4bb374c82acd80ab65999641f69e7cf264d0c881041ba8c7eb4279", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221648, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "6ccb4de6f5dbc961a9cdf66ce5f817baafef0d374f8528d6e72f95dbf787e736", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -47076,6 +55360,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -47127,6 +55414,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -47164,7 +55452,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221408, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "c1df1ee3e72a70b18cea186c7280db2b9f8f6227465919dd4878a7780ba2b15a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221408, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "27ccd6725ca48d1371eb9c644547fd7867a1b4222827f0a5b305dbc18f80a1b3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -47186,6 +55474,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -47237,6 +55528,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -47274,7 +55566,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221648, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "1ee92a002680f4e394d5d47fd9c98b17359afc641dc4e928db83ce46eeb50e18", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221648, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "da65a901cc1f66d38bdc142b2182ba364669b8a2f986c63ee7886b59a87d748b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -47296,6 +55588,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -47347,6 +55642,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -47384,7 +55680,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221408, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "df418684aeaed5424aaa7e00be7d9314046ac93978969f469e02b5ab3b4be02f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221408, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 384, "7266ee335763dca433893ba379b03ca24356b73fd3d5adc459b5b97bee6aaf69", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -47406,6 +55702,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -47457,6 +55756,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -47494,7 +55794,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len, 163232, "bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f", 512, "37c01e5b0de6a25a3e8b967d31b8b473abc4d38fd516df7ef9bc2c98359a7c34", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len, 163232, "bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f", 512, "74048d78f165c4718ab87b7d5eaa15fd2bf748c896efb00fe7444dc2a9544641", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -47516,6 +55816,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -47567,6 +55870,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -47604,7 +55908,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len, 163232, "bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f", 512, "7e766a1b960c9954298a62b543c4772cbbce450c6869c3e1251a90800a107a42", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin_len, 163232, "bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f", 512, "40455ada6cdd666dcb565e1b8b343d398a2552b5b7e352340fee622f94acb9b7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -47626,6 +55930,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -47677,6 +55984,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -47714,7 +56022,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len, 77600, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f", 384, "cfdcd9f9efa9bc71c623d6a90af473bf2bf4b7e292f66d88f255ac1530bc1e8b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len, 77600, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f", 384, "1d1de948d9f3c2ce81c3aa82c51adc504fe005051d772daa78f645312a015a21", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -47736,6 +56044,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -47787,6 +56098,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -47824,7 +56136,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len, 77600, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f", 384, "294765edcf388260cb6dfc0daba7346a01a94eea91dc009bb99dfc23aef04ffc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin_len, 77600, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f", 384, "564d334dd5f5e2a08aedc01517266b027757e3f75034f281e0be279594fc2230", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -47846,6 +56158,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 64 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -47897,6 +56212,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 1 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -47934,7 +56250,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f", 256, "3321838e40ff160f9418f33b7770187e9176f0f26fa54c8406498eadfb5932b3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f", 256, "cc935a19a3175c2ce7532824e99c5046c7e5bf911d0d34421d3fc55455c2e78b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -47956,6 +56272,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -48007,6 +56326,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -48044,7 +56364,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f", 256, "45d2b9c2d0897ea2533e861a8cb5f05476e57bfecd84380bd9e1e0c3a9f2f31f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f", 256, "bb71c40f99f8397a4e11e563ee20fdef16abf4d5654ffe8e9148940c70f8d8b0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -48066,6 +56386,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -48117,6 +56440,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -48154,7 +56478,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f", 256, "4ae229fc250b0b3004c13124172ffb6d395b2ac5eeca8df4f557655d5c2781b2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f", 256, "ba3b54ff4b255d4f18a0f24786e072ac49d268878539f7b05ba58e0b6831aafe", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -48176,6 +56500,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -48227,6 +56554,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -48264,7 +56592,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f", 256, "e39b66baca6e3e685fbbc4082d610427f5e99d14769ecf75f94c6cbc8a3295c2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin_len, 215168, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f", 256, "a7b683d3578de3e088a52661c12bf0432c58ba73ae61a760d417e327de9df4ea", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -48286,6 +56614,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -48337,6 +56668,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -48374,7 +56706,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 191304, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 896, "24dc8f20fdda3f9405c61f05fd5e361cce6ad9dc4ff67fcc4957e9c3798e222c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 191304, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 768, "8fb34cc699c2afd026b97b99b7c98069f6103dba1046cdb9efc85e85ee90d185", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -48396,6 +56728,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -48447,6 +56782,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -48484,7 +56820,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 191304, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 896, "9cc8cfaf654f07c100b1114379471bf4b1e3d4aba7a22b9e245b41170a5d9066", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 191304, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 768, "6cf59d2ffad1abac9ea0207789377c47d68b15e3866329b3982752ae4ea3e5db", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -48506,6 +56842,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -48557,6 +56896,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -48594,7 +56934,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 215640, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 896, "344313c5bf095b97c20e12694ffbbb5df9781951e4b77f93e17d94739a9b12a8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 215640, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 768, "4c41fae50baf76c1e591ea4e64dec9850d5cc19fb2cc56e48788c6e4bab2e785", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -48616,6 +56956,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -48667,6 +57010,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -48704,7 +57048,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 215640, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 896, "746bcc9c8c8788c918497857e7c81f6241e156b0c56a88bd67791c097f4b3179", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 215640, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 768, "8137a224a406ab71faea54302da52fcd5062e1d2d5b8c5f1be329c98ec120c4f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -48726,6 +57070,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -48777,6 +57124,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -48814,7 +57162,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 197360, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "f8de453d3db18b8ba1aa00aba60cefb2830f635fbee0bc48d1d8e4951e93f8e5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 197360, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "669d1f3e85ed7e382ec8ce85c02f89a4b427392dee8b3e379bb9620830c1305f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -48836,6 +57184,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -48887,6 +57238,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -48924,7 +57276,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 197120, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "eb6e6676e0ac0cae7650e24b6772b1245c1da7acd8f93e277fbbb200a043cdf0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 197120, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "6ba88963ffd4b393c04fe4c37cebad82d2d03ea735385469133111ce2753505a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -48946,6 +57298,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -48997,6 +57352,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -49034,7 +57390,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 223064, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "e9ac99c08d21259fc96c003dbece1d517120371653d4ccaaeb2948b32f4a58e1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 223064, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "8f97f50c18713800744704fcfa38d723421ef4c9d69ee69c9c560d46868f6349", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -49056,6 +57412,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -49107,6 +57466,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -49144,7 +57504,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 222824, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "666201c688d1a3b7d3f599a511dbf13e6f34480372780e4f6f9c857a365103c0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 222824, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "8526c51cd4d393bc4f0f04cf12ff21f9a87354a5af2687d9d54585f18134402b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -49166,6 +57526,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -49217,6 +57580,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -49254,7 +57618,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 197360, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "2e04bd5c26ce05537c97b7e0c0362c83da2704220612ea31f04dc3a38e222bc3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 197360, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "15fcbc55273f88c9fe2109ddb25dc609a755910294bd51cfc297d0d684613774", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -49276,6 +57640,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -49327,6 +57694,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -49364,7 +57732,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 197120, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "8dbe2863e08fb5c7eff12288ecad419ee4b391e42ae3c3e6eb7a58a95534a413", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 197120, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "a8b87f00a9a88cd55f84bb947670344dca20bea4291b6eebfbe5a574f4b96c89", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -49386,6 +57754,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -49437,6 +57808,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -49474,7 +57846,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 223064, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "4d8a1f1e2a120749dbc883bd8240b5bc187d16dd6a75cccf274ab23edec83d8c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 223064, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "7589183f08285e6d80b242657a5fb2ca6adda34b54f8f086fe57375875d8d2a5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -49496,6 +57868,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -49547,6 +57922,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -49584,7 +57960,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 222824, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "257f386e11faa678a59c37642951ca6d7774f9ef2f794b4110be35a6abec186e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 222824, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "7a0b2ba2c717de65e9360ab27d191b154a4cd6cc9917f121ea89bf982acd4cf8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -49606,6 +57982,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -49657,6 +58036,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -49694,7 +58074,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 222952, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 640, "e57dc702ccb6087a51d55c71850762515d2b044693171c0416c984931c4ba4d4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 222952, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 640, "c185c35b7377b0401896a87dfa4ce944f0a0b7440fd278bbf8dd16c6dcc12b58", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -49716,6 +58096,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 1 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -49767,6 +58150,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 1 , /* mUsePerTokenSfA */ 0 @@ -49804,7 +58188,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 222712, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 640, "2eca134f8c7e145abbe6325e6b3056d5823d76cdfd7f88942ac35dff4669778f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin_len, 222712, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f", 640, "48089a53fffd0f685c867381d5e2551fa848016cc00384be1df66ec173e2ed6f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -49826,6 +58210,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 1 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -49877,6 +58264,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 1 , /* mUsePerTokenSfA */ 0 @@ -49914,7 +58302,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220912, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "f5c8c88fcb4fc0ac488056767f95904425a99e9f98fbd8d66ccb27fb07b98e68", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220912, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "2091c5e04fc62bbaa3358ef69f2a084c90fa756fec2ff67b46399b4df8102d28", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -49936,6 +58324,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -49987,6 +58378,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -50024,7 +58416,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220672, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "7e6c3cb848ff20ecf6d7303c83f9937428f687c5c0643aaf608d6633f7b92d6f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220672, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "c1feaf2571010fb9f2be7f979d37ca59abb1be88c2fda0fe27c304cc7cacf13e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -50046,6 +58438,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -50097,6 +58492,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -50134,7 +58530,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 200440, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "5bd2ecbbd7d798cbda041d557cdb9549c235450cae9bdd96585f4b978958a2cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 200440, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "2960a42c78a8167e09188e9080c6b7647a74ce45a047346c7c16afe5152511ed", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -50156,6 +58552,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -50207,6 +58606,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -50244,7 +58644,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 200200, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "a723b51c5b52cea003d40b4cc107eea8a181a6119e95e424a984d5f44310abab", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 200200, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "a237fb90a828800a78688e06af857e7a63f0b219b44553a5b4897a48d14d5a29", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -50266,6 +58666,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -50317,6 +58720,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -50354,7 +58758,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220912, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "a105e47a5482008800c677870eba80d7c0396a9c1e8a67ff838205b4ff0b39c7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220912, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "96c2880b3cf546ec1b50723208dad43d8868a53190ecaf064ca12eb9358bd263", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -50376,6 +58780,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -50427,6 +58834,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -50464,7 +58872,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220672, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "4a720b3fc4cf9a997c1b7b3b8b79a8c11744db2571bca9fe73588d09af01ab95", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220672, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "12fac2e41ca2f4df89b67e072abbad569929f9d5abb65bfa870b52577f50be15", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -50486,6 +58894,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -50537,6 +58948,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -50574,7 +58986,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 200440, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "bd59464adebf444e96ea92ee669cddf27c03676a830e8de93875bbf2908ca417", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 200440, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "5e85308851dc873ef44af28b6d5555b61b91a562f1ab3af8d97b733fb2878f56", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -50596,6 +59008,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -50647,6 +59062,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -50684,7 +59100,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 200200, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "f13fcf00d270f4b5a7e3b5944638295f6775effbc68968ce3456afa9c7d646b8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 200200, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 640, "5434d7dd06013c88e5cfbe01019f4a3da1a9397ed3260f4aafc9bde2a497fec0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -50706,6 +59122,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -50757,6 +59176,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -50794,7 +59214,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len, 163768, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f", 512, "54ada705c8fd493a56dda3e476f9b172d9a3b86e9537c3d0241f0791aaa5f4c5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len, 163768, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f", 512, "c72283d7f1a49a605f2aeb88e18a1b9e9246ca0d348feb64bca71c3692a5f292", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -50816,6 +59236,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -50867,6 +59290,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -50904,7 +59328,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len, 163768, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f", 512, "cd1f8c02ecbdcc189f78c2f0af276779acad979cf28d939ce47997f8b389c4a6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len, 163768, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f", 512, "4fc79fe4b5934b77b3b81db60bfccfef2a0d8df3260f6882d165c9e5438bc8e6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -50926,6 +59350,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -50977,6 +59404,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -51014,7 +59442,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len, 183960, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f", 512, "1f43ba0c6128a56554abeef0dd0a4c2d949911d70bf4d7ff7215ddcc3e409f6a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len, 183960, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f", 512, "988189d954260b7ed37cec117494b985e1b1b606ad15e524cabbf8f8537c9148", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -51036,6 +59464,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -51087,6 +59518,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -51124,7 +59556,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len, 183960, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f", 512, "e0a5926416bb9cf0f0e7eb592409673173d4c72244e96aed0359b8e5fa67cc60", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin_len, 183960, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f", 512, "13d0ef3cb4a531d55ca90221b49d41b9b73cc2f7aace5a45d0fd69e0468d9a8d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -51146,6 +59578,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -51197,6 +59632,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -51234,7 +59670,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 222032, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "f768813198a55a00d8ed3a12470de2160cfb4e52b4c7e833366d4ce56c69e25d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 222032, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "2dbfaddede04e7c9e956c809e153ebfb26d666701cbe695898685a1e0122ec1c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -51256,6 +59692,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -51307,6 +59746,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -51344,7 +59784,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "e020e38d3944425ca1cb0fb9a0242969206b091ca967b71e5f66e33a92917bed", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "ee3e08eba2e3fb93f727394846c6be6b5f0aa81372ab08d6685d0d2df75fd1a8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -51366,6 +59806,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -51417,6 +59860,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -51454,7 +59898,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 229208, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "4d4bb9c69b05ee457f986d91c36957730faea961a6f1bba62e52befe30c0ecaf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 229208, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "fc994ac086a8ebeaecffd675ea75bb069943cb3e120417727d4b53f822813f52", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -51476,6 +59920,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -51527,6 +59974,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -51564,7 +60012,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220776, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "cc8a290a2ce0090ad0d4e6a8847f9b40fe3a1e03813aaa6f98e4d365bc540939", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220776, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "82229539dd406d8f7b479a785cfc6ba1d47dff7959bf96eeb8438db74430c8f5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -51586,6 +60034,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -51637,6 +60088,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -51674,7 +60126,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 222032, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "5af12edc89f39febd2eb67633b6ad47763807f5a3e919bd7a0ae38591323cd0c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 222032, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "9b183337c866d7fa15d7affe110003e7459f7f18b46bed933460c577a2f657e4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -51696,6 +60148,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -51747,6 +60202,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -51784,7 +60240,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "e51f7d406cbd70ce63028d2d6c136c5a6cd03e208ca0c95981270c65d53f963b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "5656b81d7da500b4ef2b812cb7cabe3da8767547fdf3f4d422e26bc3810ec7c9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -51806,6 +60262,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -51857,6 +60316,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -51894,7 +60354,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 229208, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "30a6cc69501ff4824b3d668ce3ac40de124aeafcaa6bd1fb1c4f85c98e2ff9ee", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 229208, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "e2b0fac3661c66711a9d434119d500a2c66cbaa624f8711b9267067f5e8ab52f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -51916,6 +60376,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -51967,6 +60430,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -52004,7 +60468,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220776, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "b5a49cc431ab557db3d83eeaad12af64295dce6e2e1ae2215b61ddca9303296e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220776, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "116610b681524294c19eea59d78d7bc97f623fbcf2c6c025f633fbd319d86553", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -52026,6 +60490,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -52077,6 +60544,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -52114,7 +60582,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221744, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "b441cbe049fb2f55609985dbde2057a19ebbb54a542e49fd763c0ab1fb689359", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221744, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "cb0327a0756b927e9dcd1f2cb0c5a2b4904a6375faaeff648bf4f925849389c7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -52136,6 +60604,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -52187,6 +60658,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -52224,7 +60696,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221504, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "42aa3877378223c0c9ed379e34d1629e93bdbcff6370513a2f34680e90b7785c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221504, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "89e1d2b66e2c728239c5385d978b5a3caa04b64991f10be85db18accceaba6f6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -52246,6 +60718,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -52297,6 +60772,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -52334,7 +60810,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 228920, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "9d5674c9bc313f19f3b27ba56a6da9a32cff34ca8f859572a31ca72ac7cf3206", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 228920, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "f8a905b2f5e374a3491a6ac780ec683c5451c8f68c0ec1c6012f287f72a8784f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -52356,6 +60832,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -52407,6 +60886,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -52444,7 +60924,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220488, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "fcc43a512dcfe9ed47de1bc99287874b8131197432a6c46394e8180f5b582c7c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220488, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "48a32f670dfaa8fec069aeda46b3fe796dca58bf75440ca6ade29642a60c7aeb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -52466,6 +60946,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -52517,6 +61000,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -52554,7 +61038,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221744, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "7153955fcbb178d625aa57d993e60c5c4855d4de99f048e1037b3a15dbdcc13b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221744, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "e471cb063c4f9212af112970864a4d324ca150bc36c937f3b46b9b9107eea271", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -52576,6 +61060,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -52627,6 +61114,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -52664,7 +61152,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221504, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "74c369f693ffa0aabdfa54f1c1a6cdcded710eceacd8a76bcf7e0ac2ad12283d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 221504, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "b5cb1bc045ba80c5f5927361cca5935db2f8efd32e89f38612c1cb6175204f39", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -52686,6 +61174,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -52737,6 +61228,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -52774,7 +61266,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 228920, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "4d2f9d65c91c37832dfb7b914ad9ca05f601e5d613bf3f7b900a44c56273d119", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 228920, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "a50ea86fb4c6fe1443f9fee3f3cca712e1f4f7666570b13dd887d827433ef8bc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -52796,6 +61288,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -52847,6 +61342,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -52884,7 +61380,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220488, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "e0f1646866253d44843d167cfb133d3b8f9550953c1f3341a428c7837da9defa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin_len, 220488, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f", 512, "63a0ecd05f47672619bf0981d59d599a80c15f2482cfdfc8fbaee56c6b238dbc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -52906,6 +61402,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -52957,6 +61456,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -52996,7 +61496,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { }, gemm::SmVersion::Sm100f}, #endif // EXCLUDE_SM_100F #ifndef EXCLUDE_SM_103 -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 116304, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "7bf96e3e93bd2006beec269543860721faa0c224a0e193155950ff88801b8c44", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 116304, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "6358abcdfed4f5155b6d1711ab34ed1a3112aaee0bd7c68d614ff67d5ae89c86", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -53018,6 +61518,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -53069,6 +61572,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -53106,7 +61610,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "48b6c68052d1f51b115a403401a80593480e4d3ab6b9f6989b6fd6493482c312", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "cf46a5e62e6998a15affc809152dd1169d9c107fce8b9f908f9b28e8a28cf6d0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -53128,6 +61632,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -53179,6 +61686,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -53216,7 +61724,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 116304, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "ccc02ddfccd84f6713cf001ceb5beb6936a6815033a6142731da158a0d4f6a74", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 116304, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "044bf86f52e4fdcf640d28350948a5dde7c93f255ecc41bba169c2b9952d2d1e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -53238,6 +61746,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -53289,6 +61800,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -53326,7 +61838,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "0fdd3d6345b5da1794a8a6d8b2617bc2ff48127fcf112d9f82f77b5ed2510e56", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "ca6b2e8105ce8f3a2c411bcff6043745fe454a71cc87d0c7d1132d22851dfa2d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -53348,6 +61860,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -53399,6 +61914,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -53436,7 +61952,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 140880, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "bcce3decbcd6b1b9c69c9273e8763d6388ed4cba02f92be5c05c3d2f9f877a02", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 140880, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "4823f7b6f5aab0e8c92dd89cd4b9f68e5b42f7c2a96076ffa316b136e00fab87", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -53458,6 +61974,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -53509,6 +62028,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -53546,7 +62066,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "3fc5c2a980a5885e94376b55d2d6fe13655491dbd5d8569486be27b38073b868", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "ccc6fa6ac6b4f70fa301bbcc91ade6525e497396df9f6374c679751e468aa0b2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -53568,6 +62088,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -53619,6 +62142,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -53656,7 +62180,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 140880, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "0989f429f113bb2e5b7cc12a76c9097650fd32e074f1c75a5c1a2fc771c66446", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 140880, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "557f9b3dc6a951a5163c1cd03ba4a81ed4a409f68f5e8b10923c36edc3b8296c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -53678,6 +62202,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -53729,6 +62256,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -53766,7 +62294,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "f076a2a745bcc3762baad9c3480bfa4741002d161ff9c3e131e0c4d630d0ce89", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "a42db182d845e4c6a75ddb6579a6a1735e52448763f6ca36589743c1a897f488", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -53788,6 +62316,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -53839,6 +62370,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -53876,7 +62408,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 157200, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "8e42e235febb0e85d8080d7ca5bf383ca8d1b76843632e93c5e52802713a4170", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 157200, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "f851c2fd6fdd20af6228028cf440f5313d6349b944a6d054ddb25a750d5b4d39", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -53898,6 +62430,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -53949,6 +62484,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -53986,7 +62522,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "bf80f622a05c39dc49c10d8ba559c92e8909c572200c431c1bf6aa6a09e11a6e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "cb9c4ed3d1b4cbeb6c7120662543cf3bc201030c527c4f3ed77d5bdfbf15f3f1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -54008,6 +62544,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -54059,6 +62598,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -54096,7 +62636,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 157200, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "a23bedd7e910be05bae205bea4046aca5cf74ffecaeec5f846ffd0dc880005cc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 157200, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "4c3879ab73444eea663658ce826093267a4b8bd0acdb67bd9b35677e98a9afcb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -54118,6 +62658,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -54169,6 +62712,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -54206,7 +62750,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "0e4fe19af0178c0a675fbd88ebe90128f5e02ccde10cdcda72a45a767f549742", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "421e2a85f1740320a3ce25452b819506230962e20de5f1cbc4bd34c14010a39f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -54228,6 +62772,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -54279,6 +62826,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -54316,7 +62864,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 104016, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "6fa1555201efab77a26be48589cbf70824b3722837166f0cbf2f7617b1b64166", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 104016, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "1d668e6c939995681a444b7a3f6cc916eb047b2f153d148eaed705a845aad448", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -54338,6 +62886,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -54389,6 +62940,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -54426,7 +62978,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "daba14d07fb7c7a6d2284e4bd32596e84d387be3eb98ea456f6faa7a555bef05", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "bcb163f14dd6e1a16814d47ac73de162f63768402058d10cd0d74eaaf11fd7e2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -54448,6 +63000,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -54499,6 +63054,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -54536,7 +63092,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 104016, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "861eba1756780b8bb45a7872087295ab5b0611474e150300be2eed09dc51b5c5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 104016, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "e04a16402becd94bb3e2b058027516064400dbe453475ecb93e4d4d6b5433ba0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -54558,6 +63114,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -54609,6 +63168,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -54646,7 +63206,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "ad62e4f8bfa2bc9b90c839551a269cfff5f6e64a16d33dad780cf5db9ce86bb6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "09f2bd280997995ea84a34c058730515078c2e3bd0c94b655fa8921d1fd9734c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -54668,6 +63228,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -54719,6 +63282,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -54756,7 +63320,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 123344, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "5d7b6a642090fcab9b843752c687ae54203c5c0299997419abaa3d6995b72e12", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 123344, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "04296d4c571d32fdc83bbb7c83bf091e95d5a4042a8c8ce69691622c5cab60a7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -54778,6 +63342,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -54829,6 +63396,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -54866,7 +63434,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "30ddc0e963e6c3ea6a961f2a28c0779b1aa5e18f86901015d68044ffba8e7b97", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "1322d65b500daf02b7a279d182ab7947a86f778c36cfbdb6c30178dd1cace63d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -54888,6 +63456,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -54939,6 +63510,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -54976,7 +63548,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 123344, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "7897a1aaa7a9149bd6965349700dee6f4b3d7eee0daa700ce00691a59ce5beb0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 123344, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a", 512, "43413bc1cce52825388ef672b9330cf40542d118f4a0a535c0caf0e0e2e78e4e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -54998,6 +63570,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -55049,6 +63624,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -55086,7 +63662,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "064e961e8553f3b1af22f15b1cbfebbc0acb9b42eca1178ca6f59b5f1d447bc5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a", 384, "3bf2eb544813ee36ad8b7e851e7b4bf464a9e59baa4f63525d3248c78cb598d1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -55108,6 +63684,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -55159,6 +63738,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -55196,7 +63776,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 114256, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "186ecbfbc612e8fae7c4834342ee37c849bef9113743831005f54554ef33c1e7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 114256, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "200b204bb284ef1540fb8f051941e9ae5aa53e39e16053f2fa2b4a7051affdaf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -55218,6 +63798,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -55269,6 +63852,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -55306,7 +63890,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "c4624011ee18e5c2080b53d11e3b656bd375ffb0bc18bfe4cb47086bc1a5b10e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "0ad4bd0be61ef93ad558182713c991bd101dc150a674da53f8296a3cfb610599", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -55328,6 +63912,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -55379,6 +63966,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -55416,7 +64004,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 114256, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "4e156047a3f86312e09947943720da9334d66a34e3a81e62c1653e10017c65f1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 114256, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "088b9c793924a53de401cbc00459c9fc8e8a0329295b7003d38da7cfbc642d0f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -55438,6 +64026,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -55489,6 +64080,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -55526,7 +64118,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "f394ea8f3c6cb11d4711669f85260a8b135518cf51ea13b3b8b555fdcf291fa4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "e93c0577c9a4ecb616268b736df1c68e7e4c3be2d4d3da45b9920a0816971a29", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -55548,6 +64140,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 16 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -55599,6 +64194,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -55636,7 +64232,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 136784, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "ce70042d1fda34cf57de3bd6653ad677f6922c77a5e9b68ec00d9eea01c600be", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 136784, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "9432442220a030ba762d3945129d42f8a57ade7a3ea9e4b933d1e7598a823aee", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -55658,6 +64254,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -55709,6 +64308,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -55746,7 +64346,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "be921d5a381b6d7a5b1347bf647c6c6cdd6d5a8539aa23752c7a01093d467144", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "0fe5f85f5a2097f849d571e606fa4ac1d59b7df15f29ebdc70ed70954c22732b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -55768,6 +64368,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -55819,6 +64422,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -55856,7 +64460,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 136784, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "0fe5ed379374148e39c3ded179490e9cdebfd0ede5d5d3212c3a5ee790c632d2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 136784, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "e69485d9e6cdef4235601e7477113fce9ecfa7833738cceb8e6f386728cef56b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -55878,6 +64482,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -55929,6 +64536,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -55966,7 +64574,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "86d7bb7a2e38c5700ad2fcbb2b6103233055413de0447bf1a14fda3b7dae1ab7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "e87fe52cda1348701af57103ccb63395f2dc5d46b4a2cc1de8502f753dd4ad04", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -55988,6 +64596,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 32 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -56039,6 +64650,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -56076,7 +64688,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 149008, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "848df26c6e4c9850f6c16cd7cc4fe076b350dad1acc5e267a5db2541e66dfb35", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 149008, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "f26a308ef60f7b13c482b63310fcdd928cd599c60dafb6b0e8fd552f6413226b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -56098,6 +64710,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -56149,6 +64764,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -56186,7 +64802,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "41e53a515977996b9593db892d5a8b52d6bd0caa636dae27cd1cbb3e707cae1b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "049ae2ba3c8b24ec45b5cbe0e8d843cdd0672fddde05acf408c44a34c098643b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -56208,6 +64824,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -56259,6 +64878,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -56296,7 +64916,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 149008, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "527a5084ec9a965bfa05ffbc4095520baece8612266a51b98e413ea9097685be", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 149008, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "de9fc69b25e030dd0a816d5285936ea4b0b92c05fe1b981df2096439f38318e8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -56318,6 +64938,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -56369,6 +64992,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -56406,7 +65030,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "7efe8733abb57b589d188dfd7bc05e05518b8c45dbebb9767e9ad1921c033223", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "a1ffb3f005555e72a8bb6ec16af258aa7c91eb39634e8ffce556c45538271876", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -56428,6 +65052,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 64 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -56479,6 +65106,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -56516,7 +65144,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 102992, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "6105ff9eb131e04861aae6d735592a54fb37415a8497b5fa5b903ff1286c91d4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 102992, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "2ec67b7ab25da1128b82508f8fc77213d575a08cbc6f74f0dfd9999b84e9f0ef", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -56538,6 +65166,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -56589,6 +65220,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -56626,7 +65258,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "efabcf160754497ad550d50f6ea252cff44302d5f266876ccfa50d6ac3ed2d48", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "9b196a2d829cfbe73554028d19d5119c313797543109ce4615b32f760488fe59", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -56648,6 +65280,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -56699,6 +65334,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -56736,7 +65372,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 102992, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "2c3905243260b59a6c43ac50ea18ccb69c27347368bd49aad3d965323ce9dc9a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 102992, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "fbccd1747a3a8f89adc54a946da3e45479204ad6d45d9c486911f65f23155a83", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -56758,6 +65394,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -56809,6 +65448,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -56846,7 +65486,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "722a40cf50181813441a446227612607c8bb17e6c70d1b654eb85900736aa721", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "6fd7dafd18db5800efce218fd91638b1df54127f2dd10534fffe9ce7a197a8f4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -56868,6 +65508,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -56919,6 +65562,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -56956,7 +65600,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 122320, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "c4a9c09b6b2369ce7b42a3bc1a8204c5d0544df65469cf4e2298f105ed832498", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 122320, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "af8e04f0129a02665a4f584e382c6792c7229bb5210f5e9c04a88f73c7adb8b0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -56978,6 +65622,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -57029,6 +65676,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -57066,7 +65714,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "2b55a2f62ca0f6e4a6e4f6f32961bd260176c6edb0b8b0d845a6f9a27c4780e0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "3b2d474b0ae75f56d9fa1ccdbff7f40fd3b900907c0b5e7badf1dcf23334b96e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -57088,6 +65736,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -57139,6 +65790,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -57176,7 +65828,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 122320, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "98e5f4c53c78c441c753180f023a664b775f948a64dbecd9983342e73a4dab11", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 122320, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "7ca885f110e2cd576cf46a3f8bcfa0c6430d708247df1002e51900ae442b8439", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -57198,6 +65850,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -57249,6 +65904,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 @@ -57286,7 +65942,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "5fd73c99b3c18a78fbddcb3322ae2550cb5643199c4503d86d35c25d66f20de7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a", 512, "3de99c245f139ad3299e46db54eabc11dba14504b97b99398b84c029a56b360b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClcFastDrain */ 1 @@ -57308,6 +65964,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 +, /* mFallbackClusterDimX */ 1 +, /* mFallbackClusterDimY */ 1 +, /* mFallbackClusterDimZ */ 1 , /* mFuseUtccpWithUtcmma */ 0 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 1 @@ -57359,6 +66018,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTransposeMmaOutput */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseDeepSeekFp8 */ 0 +, /* mUseFlexibleClusterDims */ 0 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseMaxTmemOverlap */ 0 , /* mUsePerTokenSfA */ 0 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h index c4c3d9587d4..5b6810938fe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h @@ -193,7 +193,7 @@ static auto makeTmaShapeStrideAbc(GemmOptions const& options, int sizeM, int siz if (matrixType != MatrixType::MatrixC) { // When using 2CTA MMA, we only need to load half of the tile in each CTA for B. - if (matrixType == MatrixType::MatrixB && tileShape[1] > 1 && options.mClusterDimX == 2) + if (matrixType == MatrixType::MatrixB && tileShape[1] > 1 && options.mClusterDimX >= 2) { tileShape[1] /= 2; } @@ -226,7 +226,7 @@ static auto makeTmaShapeStrideAbc(GemmOptions const& options, int sizeM, int siz // Create the TMA shape/stride for A/B block scaling factors. static auto makeTmaShapeStrideSfAb(int mM, int mN, int mK, MatrixType matrixType, int tileM, int tileN, int tileK, - tg::SfLayout layout, int sfReshapeFactor, const int32_t numEltsPerSf) + tg::SfLayout layout, int sfReshapeFactor, int32_t const numEltsPerSf) { // The outer dimension. @@ -524,7 +524,7 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc // Build TMA descriptor for gmem A block scaling factors. auto [shapeSfA, strideSfA, tileShapesSfA] = makeTmaShapeStrideSfAb(options.mM * options.mNumBatches, options.mN, options.mK, MatrixType::MatrixA, options.mTileM, options.mTileN, options.mTileK, - tg::SfLayout::R128c4, options.mSfReshapeFactor, numEltsPerSfA); + options.mSfLayoutA, options.mSfReshapeFactor, numEltsPerSfA); params.tmaSfA[0] = gemm::buildSfTmaDescriptor(dTypeSfA, shapeSfA, strideSfA, tileShapesSfA, const_cast(dSfA)); } @@ -646,7 +646,30 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc tg::Dtype const dTypeSf = (options.mDtypeA == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; int32_t const numEltsPerSfA = options.mSfBlockSizeA; - if (options.mRouteSfsImpl.value() == batchedGemm::RouteImpl::NoRoute) + if (batchedGemm::doesRouteImplUseTma(options.mRouteSfsImpl.value())) + { + + // The input is NOT padded: + // [act0, act1, act2, ...] + + // Build TMA descriptor for gmem A block scaling factors. + // Pad number of scaling factors to the nearest multiple of 16 because of the TMA 16B + // alignment requirement. + auto numSfsInK = options.mK / numEltsPerSfA; + numSfsInK = ceilDiv(numSfsInK, 16) * 16; + + auto numSfsInValidK = options.mValidK / numEltsPerSfA; + numSfsInValidK = ceilDiv(numSfsInValidK, 16) * 16; + + auto [shapeSfA, strideSfA, tileShapesSfA] = makeTmaShapeStrideAbc(options, options.mNumTokens, + options.mN, numSfsInK, 1 /* tileM */, options.mTileN, options.mTileK / numEltsPerSfA, + MatrixType::MatrixA, options.mNumTokens, options.mValidN, numSfsInValidK); + params.tmaSfA[0] + = gemm::buildNdTmaDescriptor(dTypeSf, shapeSfA, strideSfA, tileShapesSfA, const_cast(dSfA), + /*doPad=*/false, + /*doSwizzle=*/true); + } + else if (options.mRouteSfsImpl.value() == batchedGemm::RouteImpl::NoRoute) { // The input is padded: @@ -655,8 +678,8 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc // Build TMA descriptor for gmem A block scaling factors. auto [shapeSfA, strideSfA, tileShapesSfA] = makeTmaShapeStrideSfAb(inputNumTokensSfA, options.mN, - options.mK, MatrixType::MatrixA, options.mTileM, options.mTileN, options.mTileK, - tg::SfLayout::R128c4, options.mSfReshapeFactor, numEltsPerSfA); + options.mK, MatrixType::MatrixA, options.mTileM, options.mTileN, options.mTileK, options.mSfLayoutA, + options.mSfReshapeFactor, numEltsPerSfA); params.tmaSfA[0] = gemm::buildSfTmaDescriptor(dTypeSf, shapeSfA, strideSfA, tileShapesSfA, const_cast(dSfA)); } diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParamsDecl.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParamsDecl.h index 36c7e819817..c0d9ee1dbb8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParamsDecl.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParamsDecl.h @@ -230,7 +230,7 @@ struct KernelParams // The pre-activation scaling factor (typically dequantA * dequantB) for non-gated non-linear // activation. - // Only used when non-linear activation is applied (e.g., GELU, Relu2). + // Only used when non-linear activation is applied (e.g., GELU, Relu2, Silu). // When used, scaleC should be quantScaleC only, and this scale is applied before the // activation. Shape is [B]. float const* ptrScaleAct{nullptr}; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h index e73decab006..b18ad67bfbe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h @@ -390,77 +390,86 @@ class KernelTraits } // Per-token Scale Factors - { - // Number of bytes for per-token scale factors - auto const numBytesSmemPerTokenSf - = (usePerTokenSfA ? (tileM) * sizeof(float) : 0) + (usePerTokenSfB ? (tileN) * sizeof(float) : 0); - // Number of bytes alignment for per-token scale factors - auto const numBytesAlignmentPerTokenSf = 16; - // Add info. - smemChunkNames.emplace_back("smemPerTokenSf"); - numBytesAndAlignmentPerSmemChunk.emplace_back( - std::make_pair(numBytesSmemPerTokenSf, numBytesAlignmentPerTokenSf)); - firstChunkReuseSmem.emplace_back(false); - } - - // Bias - { - int32_t numBytesSmemBias = 0; - if (isBiasTypeN(biasType)) - { - numBytesSmemBias = tileN * sizeof(float); - } - else if (isBiasTypeM(biasType)) - { - numBytesSmemBias = tileM * sizeof(float); - } - else if (isBiasTypeMn(biasType)) - { - numBytesSmemBias = tileM * tileN * sizeof(float); - } - // Number of bytes alignment for bias - auto const numBytesAlignmentBias = 16; - // Add info. - smemChunkNames.emplace_back("smemBias"); - numBytesAndAlignmentPerSmemChunk.emplace_back(std::make_pair(numBytesSmemBias, numBytesAlignmentBias)); - firstChunkReuseSmem.emplace_back(false); - } + {{// Number of bytes for per-token scale factors + auto const numBytesSmemPerTokenSf = (usePerTokenSfA ? (tileM) * sizeof(float) : 0); + // Number of bytes alignment for per-token scale factors + auto const numBytesAlignmentPerTokenSf = 16; + // Add info. + smemChunkNames.emplace_back("smemPerTokenSfA"); + numBytesAndAlignmentPerSmemChunk.emplace_back( + std::make_pair(numBytesSmemPerTokenSf, numBytesAlignmentPerTokenSf)); + firstChunkReuseSmem.emplace_back(false); + } + { + // Number of bytes for per-token scale factors + auto const numBytesSmemPerTokenSf = (usePerTokenSfB ? (tileN) * sizeof(float) : 0); + // Number of bytes alignment for per-token scale factors + auto const numBytesAlignmentPerTokenSf = 16; + // Add info. + smemChunkNames.emplace_back("smemPerTokenSfB"); + numBytesAndAlignmentPerSmemChunk.emplace_back( + std::make_pair(numBytesSmemPerTokenSf, numBytesAlignmentPerTokenSf)); + firstChunkReuseSmem.emplace_back(false); + } + } - // Per-block absolute maximum for multi-warp reduction. - { - // Number of bytes: number of epilogue warps * number of tile columns. - auto const numBytesSmemBlockAmax = transposeMmaOutput ? 4 * tileN * sizeof(float) : 0; - // Number of bytes alignment. - auto const numBytesAlignmentBlockAmax = 16; - // Add info. - smemChunkNames.emplace_back("smemBlockAmax"); - numBytesAndAlignmentPerSmemChunk.emplace_back( - std::make_pair(numBytesSmemBlockAmax, numBytesAlignmentBlockAmax)); - firstChunkReuseSmem.emplace_back(false); - } + // Bias + { + int32_t numBytesSmemBias = 0; + if (isBiasTypeN(biasType)) + { + numBytesSmemBias = tileN * sizeof(float); + } + else if (isBiasTypeM(biasType)) + { + numBytesSmemBias = tileM * sizeof(float); + } + else if (isBiasTypeMn(biasType)) + { + numBytesSmemBias = tileM * tileN * sizeof(float); + } + // Number of bytes alignment for bias + auto const numBytesAlignmentBias = 16; + // Add info. + smemChunkNames.emplace_back("smemBias"); + numBytesAndAlignmentPerSmemChunk.emplace_back(std::make_pair(numBytesSmemBias, numBytesAlignmentBias)); + firstChunkReuseSmem.emplace_back(false); + } - // SmemConstSfBuf - // A buffer used to copy constant values to TMEM. - { - // Do we need the buffer? - bool const useConstSfBuf = dtypeB == tg::Dtype::E4m3 && dtypeMmaB == tg::Dtype::MxE4m3; - // Number of bytes for the buffer. - auto const numSmemBytesConstSfBuf = useConstSfBuf ? 512 : 0; - // Number of bytes for the alignment of the buffer. - auto const numBytesAlignmentConstSfBuf = 16; - // No need to reuse the first chunk. - auto const reuseChunksSmemConstSfBuf = false; + // Per-block absolute maximum for multi-warp reduction. + { + // Number of bytes: number of epilogue warps * number of tile columns. + auto const numBytesSmemBlockAmax = transposeMmaOutput ? 4 * tileN * sizeof(float) : 0; + // Number of bytes alignment. + auto const numBytesAlignmentBlockAmax = 16; + // Add info. + smemChunkNames.emplace_back("smemBlockAmax"); + numBytesAndAlignmentPerSmemChunk.emplace_back( + std::make_pair(numBytesSmemBlockAmax, numBytesAlignmentBlockAmax)); + firstChunkReuseSmem.emplace_back(false); + } - // Add info. - smemChunkNames.emplace_back("smemConstSfBuf"); - numBytesAndAlignmentPerSmemChunk.emplace_back( - std::make_pair(numSmemBytesConstSfBuf, numBytesAlignmentConstSfBuf)); - firstChunkReuseSmem.emplace_back(reuseChunksSmemConstSfBuf); - } + // SmemConstSfBuf + // A buffer used to copy constant values to TMEM. + { + // Do we need the buffer? + bool const useConstSfBuf = dtypeB == tg::Dtype::E4m3 && dtypeMmaB == tg::Dtype::MxE4m3; + // Number of bytes for the buffer. + auto const numSmemBytesConstSfBuf = useConstSfBuf ? 512 : 0; + // Number of bytes for the alignment of the buffer. + auto const numBytesAlignmentConstSfBuf = 16; + // No need to reuse the first chunk. + auto const reuseChunksSmemConstSfBuf = false; + + // Add info. + smemChunkNames.emplace_back("smemConstSfBuf"); + numBytesAndAlignmentPerSmemChunk.emplace_back( + std::make_pair(numSmemBytesConstSfBuf, numBytesAlignmentConstSfBuf)); + firstChunkReuseSmem.emplace_back(reuseChunksSmemConstSfBuf); + } - // Create SMEM helper object. - mSmemAllocatorHelper - = MemAllocatorHelper(numBytesAndAlignmentPerSmemChunk, firstChunkReuseSmem, smemChunkNames); + // Create SMEM helper object. + mSmemAllocatorHelper = MemAllocatorHelper(numBytesAndAlignmentPerSmemChunk, firstChunkReuseSmem, smemChunkNames); #if 0 // E.g., // Chunk 0 smemLoadA: 32768 bytes, 1024 alignment, false, offset 0 @@ -470,146 +479,145 @@ class KernelTraits // Chunk 4 smemGmemC1: 65536 bytes, 1024 alignment, false, offset 65536 // Chunk 5 smemRowMax: 512 bytes, 16 alignment, false, offset 131072 // Chunk 6 smemSliceK: 0 bytes, 16 alignment, false, offset 131584 - // Chunk 7 smemPerTokenSf: 0 bytes, 16 alignment, false, offset 131584 + // Chunk 7 smemPerTokenSfA: 0 bytes, 16 alignment, false, offset 131584 + // Chunk 8 smemPerTokenSfB: 0 bytes, 16 alignment, false, offset 131584 mSmemAllocatorHelper.print(); #endif - } - - // - // TMEM - // - // [..D..][..A..][.SfA.][.SfB.] - { - std::vector> numBytesAndAlignmentPerTmemChunk; - std::vector firstChunkReuseTmem; - std::vector tmemChunkNames; - // Matrix D - { - // Two set of TMEM resources for D share epilogueTileN columns, - // | set0:epiTileN0 | set0:epiTileN1/set1:epiTileN0 | set1:epiTileN1 | - auto const numCols = mUseMaxTmemOverlap ? 2 * tileN - epilogueTileN : tileN; - // Number of columns for accumulators. - auto const numTmemColsD = numSlicesForSliceK * numCols * numStagesMma * tg::dtypeGetNumBits(dtypeAcc) - / tg::dtypeGetNumBits(tg::Dtype::UInt32); - // Number of columns for D alignment. - auto const numColsAlignmentD = 2; - // No need to reuse TMEM. - auto const reuseChunksTmemD = false; - - // Add info. - tmemChunkNames.emplace_back("tmemD"); - numBytesAndAlignmentPerTmemChunk.emplace_back(std::make_pair(numTmemColsD, numColsAlignmentD)); - firstChunkReuseTmem.emplace_back(reuseChunksTmemD); - } - - // Matrix A - { - // We use TMEM for A if we use slice-K or if we need to cast A. - bool const useTmemA = (numSlicesForSliceK > 1) || (dtypeMmaA != dtypeA); - // Number of columns for A. - auto const numTmemColsA = useTmemA ? numStages * tileK - / (numSlicesForSliceK * tg::dtypeGetNumBits(tg::Dtype::UInt32) / tg::dtypeGetNumBits(dtypeMmaA)) - : 0; - // Number of columns for A alignment. - auto const numColsAlignmentA = 4; - // No need to reuse TMEM. - auto const reuseChunksTmemA = false; - - // Add info. - tmemChunkNames.emplace_back("tmemA"); - numBytesAndAlignmentPerTmemChunk.emplace_back(std::make_pair(numTmemColsA, numColsAlignmentA)); - firstChunkReuseTmem.emplace_back(reuseChunksTmemA); - } - - // Sf A - { - // Does the MMA require block scales in TMEM for A? - bool const useBlockScalingA = tg::dtypeIsBlockFmt(dtypeMmaA); - // Are the block scales constant? - bool const useConstSfA = useBlockScalingA && !tg::dtypeIsBlockFmt(dtypeA); - // TMEM cols group size in the K dimension. - int32_t kGroupSize = 4; - // Number of columns per stage. - int32_t const numColsPerStage = useBlockScalingA - ? ((tileK / (kGroupSize * numEltsPerSfA)) * tg::getTmemColStridePerGroup(tileM, mmaK, kGroupSize)) - : 0; - // Number of columns for scaling factors of A. - auto const numTmemColsSfA = useConstSfA ? tg::roundUp(numColsPerStage, 4) - : (numColsPerStage * (mFuseUtccpWithUtcmma ? 1 : numStages)); - // Number of columns for Sf alignment. - auto const numColsAlignmentSfA = 4; - // No need to reuse TMEM. - auto const reuseChunksTmemSfA = false; - - // Add info. - tmemChunkNames.emplace_back("tmemSfA"); - numBytesAndAlignmentPerTmemChunk.emplace_back(std::make_pair(numTmemColsSfA, numColsAlignmentSfA)); - firstChunkReuseTmem.emplace_back(reuseChunksTmemSfA); - } +} - // Sf B - { - // Does the MMA require block scales in TMEM for B? - bool const useBlockScalingB = tg::dtypeIsBlockFmt(dtypeMmaB); - // Are the block scales constant? - bool const useConstSfB = useBlockScalingB && !tg::dtypeIsBlockFmt(dtypeB); - // TMEM cols group size in the K dimension. - int32_t kGroupSize = 4; - // Number of columns per stage. - int32_t const numColsPerStage = useBlockScalingB - ? ((tileK / (kGroupSize * numEltsPerSfB)) * tg::getTmemColStridePerGroup(tileN, mmaK, kGroupSize)) - : 0; - // Number of columns for scaling factors of B. - auto const numTmemColsSfB = useConstSfB ? tg::roundUp(numColsPerStage, 4) - : (numColsPerStage * (mFuseUtccpWithUtcmma ? 1 : numStages)); - // Number of columns for Sf alignment. - auto const numColsAlignmentSfB = 4; - // No need to reuse TMEM. - auto const reuseChunksTmemSfB = false; +// +// TMEM +// +// [..D..][..A..][.SfA.][.SfB.] +{ + std::vector> numBytesAndAlignmentPerTmemChunk; + std::vector firstChunkReuseTmem; + std::vector tmemChunkNames; + // Matrix D + { + // Two set of TMEM resources for D share epilogueTileN columns, + // | set0:epiTileN0 | set0:epiTileN1/set1:epiTileN0 | set1:epiTileN1 | + auto const numCols = mUseMaxTmemOverlap ? 2 * tileN - epilogueTileN : tileN; + // Number of columns for accumulators. + auto const numTmemColsD = numSlicesForSliceK * numCols * numStagesMma * tg::dtypeGetNumBits(dtypeAcc) + / tg::dtypeGetNumBits(tg::Dtype::UInt32); + // Number of columns for D alignment. + auto const numColsAlignmentD = 2; + // No need to reuse TMEM. + auto const reuseChunksTmemD = false; + + // Add info. + tmemChunkNames.emplace_back("tmemD"); + numBytesAndAlignmentPerTmemChunk.emplace_back(std::make_pair(numTmemColsD, numColsAlignmentD)); + firstChunkReuseTmem.emplace_back(reuseChunksTmemD); + } - // Add info. - tmemChunkNames.emplace_back("tmemSfB"); - numBytesAndAlignmentPerTmemChunk.emplace_back(std::make_pair(numTmemColsSfB, numColsAlignmentSfB)); - firstChunkReuseTmem.emplace_back(reuseChunksTmemSfB); - } + // Matrix A + { + // We use TMEM for A if we use slice-K or if we need to cast A. + bool const useTmemA = (numSlicesForSliceK > 1) || (dtypeMmaA != dtypeA); + // Number of columns for A. + auto const numTmemColsA = useTmemA ? numStages * tileK + / (numSlicesForSliceK * tg::dtypeGetNumBits(tg::Dtype::UInt32) / tg::dtypeGetNumBits(dtypeMmaA)) + : 0; + // Number of columns for A alignment. + auto const numColsAlignmentA = 4; + // No need to reuse TMEM. + auto const reuseChunksTmemA = false; + + // Add info. + tmemChunkNames.emplace_back("tmemA"); + numBytesAndAlignmentPerTmemChunk.emplace_back(std::make_pair(numTmemColsA, numColsAlignmentA)); + firstChunkReuseTmem.emplace_back(reuseChunksTmemA); + } - // Sparsity info for A - { - // Number of columns for the sparsity info for A (note: for Dense, this is 0). - auto const numTmemColsSparsityInfoA - = numStages * tg::getNumBytesSparsityInfo(sparsityA, tileK) / 4 /* bytes */; - // Number of columns for Sf alignment. - auto const numColsAlignmentSparsityInfoA = 2; - // No need to reuse TMEM. - auto const reuseChunksTmemSparsityInfoA = false; + // Sf A + { + // Does the MMA require block scales in TMEM for A? + bool const useBlockScalingA = tg::dtypeIsBlockFmt(dtypeMmaA); + // Are the block scales constant? + bool const useConstSfA = useBlockScalingA && !tg::dtypeIsBlockFmt(dtypeA); + // TMEM cols group size in the K dimension. + int32_t kGroupSize = 4; + // Number of columns per stage. + int32_t const numColsPerStage = useBlockScalingA + ? ((tileK / (kGroupSize * numEltsPerSfA)) * tg::getTmemColStridePerGroup(tileM, mmaK, kGroupSize)) + : 0; + // Number of columns for scaling factors of A. + auto const numTmemColsSfA = useConstSfA ? tg::roundUp(numColsPerStage, 4) + : (numColsPerStage * (mFuseUtccpWithUtcmma ? 1 : numStages)); + // Number of columns for Sf alignment. + auto const numColsAlignmentSfA = 4; + // No need to reuse TMEM. + auto const reuseChunksTmemSfA = false; + + // Add info. + tmemChunkNames.emplace_back("tmemSfA"); + numBytesAndAlignmentPerTmemChunk.emplace_back(std::make_pair(numTmemColsSfA, numColsAlignmentSfA)); + firstChunkReuseTmem.emplace_back(reuseChunksTmemSfA); + } - // Add info. - tmemChunkNames.emplace_back("tmemSparsityInfoA"); - numBytesAndAlignmentPerTmemChunk.emplace_back( - std::make_pair(numTmemColsSparsityInfoA, numColsAlignmentSparsityInfoA)); - firstChunkReuseTmem.emplace_back(reuseChunksTmemSparsityInfoA); - } + // Sf B + { + // Does the MMA require block scales in TMEM for B? + bool const useBlockScalingB = tg::dtypeIsBlockFmt(dtypeMmaB); + // Are the block scales constant? + bool const useConstSfB = useBlockScalingB && !tg::dtypeIsBlockFmt(dtypeB); + // TMEM cols group size in the K dimension. + int32_t kGroupSize = 4; + // Number of columns per stage. + int32_t const numColsPerStage = useBlockScalingB + ? ((tileK / (kGroupSize * numEltsPerSfB)) * tg::getTmemColStridePerGroup(tileN, mmaK, kGroupSize)) + : 0; + // Number of columns for scaling factors of B. + auto const numTmemColsSfB = useConstSfB ? tg::roundUp(numColsPerStage, 4) + : (numColsPerStage * (mFuseUtccpWithUtcmma ? 1 : numStages)); + // Number of columns for Sf alignment. + auto const numColsAlignmentSfB = 4; + // No need to reuse TMEM. + auto const reuseChunksTmemSfB = false; + + // Add info. + tmemChunkNames.emplace_back("tmemSfB"); + numBytesAndAlignmentPerTmemChunk.emplace_back(std::make_pair(numTmemColsSfB, numColsAlignmentSfB)); + firstChunkReuseTmem.emplace_back(reuseChunksTmemSfB); + } - // Create TMEM helper object. - mTmemAllocatorHelper - = MemAllocatorHelper(numBytesAndAlignmentPerTmemChunk, firstChunkReuseTmem, tmemChunkNames); - } + // Sparsity info for A + { + // Number of columns for the sparsity info for A (note: for Dense, this is 0). + auto const numTmemColsSparsityInfoA = numStages * tg::getNumBytesSparsityInfo(sparsityA, tileK) / 4 /* bytes */; + // Number of columns for Sf alignment. + auto const numColsAlignmentSparsityInfoA = 2; + // No need to reuse TMEM. + auto const reuseChunksTmemSparsityInfoA = false; + + // Add info. + tmemChunkNames.emplace_back("tmemSparsityInfoA"); + numBytesAndAlignmentPerTmemChunk.emplace_back( + std::make_pair(numTmemColsSparsityInfoA, numColsAlignmentSparsityInfoA)); + firstChunkReuseTmem.emplace_back(reuseChunksTmemSparsityInfoA); } + // Create TMEM helper object. + mTmemAllocatorHelper = MemAllocatorHelper(numBytesAndAlignmentPerTmemChunk, firstChunkReuseTmem, tmemChunkNames); +} +} // namespace gemm + public: - // The MMA kind. - tg::MmaKind mMmaKind{}; - // Whether fuse Utccp into the MMA task. - bool mFuseUtccpWithUtcmma{}; - // Whether use the max TMEM overlap trick. - bool mUseMaxTmemOverlap{}; - // The number of epilogue warps. - int32_t mNumEpilogueWarps{}; - // Helper for SMEM allocation. - MemAllocatorHelper mSmemAllocatorHelper; - // Helper for TMEM allocation. - MemAllocatorHelper mTmemAllocatorHelper; -}; +// The MMA kind. +tg::MmaKind mMmaKind{}; +// Whether fuse Utccp into the MMA task. +bool mFuseUtccpWithUtcmma{}; +// Whether use the max TMEM overlap trick. +bool mUseMaxTmemOverlap{}; +// The number of epilogue warps. +int32_t mNumEpilogueWarps{}; +// Helper for SMEM allocation. +MemAllocatorHelper mSmemAllocatorHelper; +// Helper for TMEM allocation. +MemAllocatorHelper mTmemAllocatorHelper; +}; // namespace batchedGemm //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -680,9 +688,16 @@ inline int32_t getSmemOffsetSliceK(KernelTraits traits) //////////////////////////////////////////////////////////////////////////////////////////////////// -inline int32_t getSmemOffsetPerTokenSf(KernelTraits traits) +inline int32_t getSmemOffsetPerTokenSfA(KernelTraits traits) +{ + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemPerTokenSfA"); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline int32_t getSmemOffsetPerTokenSfB(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemPerTokenSf"); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemPerTokenSfB"); } //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/TmaDescriptor.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/TmaDescriptor.h index d09ffb7f298..8c1a6347322 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/TmaDescriptor.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/TmaDescriptor.h @@ -19,6 +19,7 @@ #include "trtllm/gen/DtypeDecl.h" #include "trtllm/gen/MmaDecl.h" #include +#include #ifdef TLLM_ENABLE_CUDA #include diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/config.json b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/config.json index 51f7d7895ee..2ad25f095da 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/config.json +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/config.json @@ -205,7 +205,8 @@ "fusedAct,act,eltwiseActType": [ [true, "swiglu", "none"], [true, "geglu", "none"], - [false, "swiglu", "relu2"] + [false, "swiglu", "relu2"], + [false, "swiglu", "silu"] ], "sfLayoutB": "linear", "useUnrollLoop2xForMma": [true, false], @@ -230,7 +231,8 @@ "routeSfsAct": "tma", "fusedAct,act,eltwiseActType": [ [true, "geglu", "none"], - [false, "none", "relu2"] + [false, "none", "relu2"], + [false, "none", "silu"] ], "sfLayoutA": "128x4", "sfLayoutB": "linear", @@ -254,7 +256,8 @@ "fusedAct,act,eltwiseActType": [ [true, "swiglu", "none"], [true, "geglu", "none"], - [false, "none", "relu2"] + [false, "none", "relu2"], + [false, "none", "silu"] ], "sfLayoutB": "linear", "useUnrollLoop2xForMma": [true, false], @@ -275,7 +278,8 @@ "fusedAct,act,eltwiseActType": [ [true, "swiglu", "none"], [true, "geglu", "none"], - [false, "none", "relu2"] + [false, "none", "relu2"], + [false, "none", "silu"] ], "sfLayoutB": "linear", "useUnrollLoop2xForMma": false, @@ -409,7 +413,8 @@ "routeAct": "tma", "fusedAct,eltwiseActType": [ [true, "none"], - [false, "relu2"] + [false, "relu2"], + [false, "silu"] ], "usePerTokenSfB": true, "useUnrollLoop2xForMma": [true, false], @@ -431,7 +436,8 @@ "routeAct": "tma", "fusedAct,eltwiseActType": [ [true, "none"], - [false, "relu2"] + [false, "relu2"], + [false, "silu"] ], "usePerTokenSfB": true, "numRegsPerThreadNonEpilogueWarp": 56, diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d33e0cdffe4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fb7cecb5f5aca5eb6d15c9a38332e1ffd6e20ecade5be5915ba4e9fa4354b7e +size 638826 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0976285a912..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bb1fd74f68a4fcb5d81622999e54c5401510c42c866c8dcd11817b895172dc53 -size 615997 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..8495a6d3fe4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea382dc47c588105057f170e83dbff54e94f056d89083cd49b28bd100f52dec9 +size 655208 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index c9e0cb6d739..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ccf78ca00c241d696da6de81b5432fa512707aaf389ae48e0d042fd1ec482b2d -size 632430 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..124cda881cc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c51fe565dfdb33ed64362558318a9f9a8146c69535f2112dd1e14e95545528be +size 578533 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f8d1126af2b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cfe9effc7a2a218f7c745e6c4d0440776b681ffd44a07548c0d22bfce9102af +size 451241 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4bdeadc5282..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:45a0a537039390c5b789006c26a757de70bbdb5df05e42b799c60e49201da3fc -size 579287 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 44b7fe50050..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eeec6ae6a67c7278dde7dede32b471afc97f4cd5622286009a12e6b472f07793 -size 451995 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0e10d3f129f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ba6731622f1b911ef1c2afa58ffb06eeb823e0178ad8f67bb08c3de542bfb1f +size 594077 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..41d79ec830b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73958a14b75fe6e61f5dbdaa93e1e68daaaf6a8d07cd1b66987daa71e9ed38a7 +size 471175 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index f7930eb0b5c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:915ab441627dc42851d991becf9995a166b9c53c08c5445ab7c3a72fa304a5aa -size 594831 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 1040d856cca..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:41de4188d47a261871b5886acb9f9e9dec3cc37fbbf352d6a5f7a657cd0eb5d4 -size 471929 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f321dd4bf0a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a9569f9852014cd18a9f44b57ac43cb07977fdb0d2623be567f45845276da76 +size 601375 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e7cd2fcfdc6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ad514fd908d4db6c3654e74f438b97d522788dce1216f0d9e13af0c325d372b +size 466879 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 14c55cc2cee..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bd920526c0f3a4c4cb2b378ec5ca1df5d467a1a24c2f5cf568e809ec95599287 -size 598231 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index bf59881c1a9..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6098363f9f22fc8ea7771456994fda24aaa0ac16ba53c42755271ea36f03ffdd -size 467683 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7e649bae518 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3349134fa1c86d6d798ef3e3e60fafbba5f79afb67904f7081655652c9c455c +size 629006 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..54f158e3d15 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1554a195a73311ba4df80c2d9cf6bf5f6b13c7873f0fbc5c53d537c531efe2f +size 490811 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0e8ec984d4d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fa9d2ae4389fad4928e80f500de6f126d17bba0509324cb10338283aa628eaf1 -size 624776 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index d7dc73b03b5..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a8a1dd1587a7f9e291bdd2900b1abc1faac73d7b2ec4a740a545f2737524fefc -size 491613 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..fa95a22a560 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca4c12e592653e3d11821e665b0ecd24cdc470a317453677e08e7130ae235c0b +size 655950 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 1e00fa5e1f3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:18d6b5682f282804745e0ab137f3d220d03ea9c54609f605307c40887dfff336 -size 636082 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..21870ff6ffc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42b2b33180920229a9e5d243bf497005a96a3eb7bc0cf1e5bb88279b687b10db +size 585193 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..76261c176a7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6dae14cf5f65b8e8300d99ef6d58150d7376a38c978746d4ffd8c47ad72686a +size 457211 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 1d4046bba1b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ea621f03e28405db0b937e2ca88d8ad0b86b372f7b594fa0a250fd60e1455602 -size 585947 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 662c10402a6..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cfb3b766e3260d8a80db56c9dcc14bc3d04131bbdd457d1458e8121988efc870 -size 457963 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b47956b691d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b45d92fb9e725319e60f06e6c4ced9713d8f1f955cfb867f08b87c00465eb5b +size 610703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e94dace0fc8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1f85a0c60728a826bd5d0cc58f0aaf046e7aff635b3c38b205fcb3e375e31e2 +size 477145 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 28250f24621..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:34624008e9023db6d83888c7209fbb7313752ae3e39ad26c33b88b57225229b3 -size 610667 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5db9479afe8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:84762d89de631a07c2a41c45db4393c89ed01e2bd2ad9cb25f857461ae21fa07 -size 477899 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c02ba172fcb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb841ecd80a33d1c20e75bed5168d357d164afd80edafe34b3277ef08ead1d91 +size 607047 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..02fbc98d02a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8ab776880d42f7e1f08950e844e0919ad79e553c82911edbccde83d1d944515 +size 473343 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index c3169457da6..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:acaa0155353f5b04d7e5e329b18527b1fe35961b31955fc1f3965c46c6c50abe -size 603903 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 855ef7a62a9..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:20d2ef5c80a365ecf88aaa9b287b42d3e29afdb4be9f1ce3e529368fc73f9842 -size 473355 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..972d0cac9a9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86f9463b72c9c4aec4a8a32a23e17411e1a9fe7b94e923ad28062177a752b004 +size 642030 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..744573583ca --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e123b5c316a726057c256476bc3a138c9dae9e71cbfe1c480f94b6b6766e92f1 +size 497273 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index e3a44e79bc3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:533d0d814b32c4cc0d05c8aaf5eb8e413e1dd221e98462d15518c5148ecf59d9 -size 641748 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3c06955bb9d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:592494af114e214a1cab7168d6abae7c9a6b7d16408cec5ba688f84afc2b827d -size 497287 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5f5be74385e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cd64ea97d93276a40bbe50b0a52d416e8968d29de744c6980ce2d67ca3792d3 +size 613855 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 308582ac8f3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:061c93467482bf760530264844132b9f355d5624302322ba0c4946680de3ce73 -size 611551 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d8a36683a19 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a34b73c0b789153b16b7ae698fe45fe93cdd3b4da14226e529144142ffae238 +size 648888 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0398cb6bb2a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4e45ad66a81365cb180108238853245a75e03e5de187faf3c804a1cc8f576093 -size 649344 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5d2f11d57e6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db6cfc7b90e119c99c6de710ed96f2fe4b8c754d75e10cbc64b62611e114859c +size 573643 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2b2785c24d6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2887c5c37527734d2dda471a8fc707409546fee53da37cbea3b48d91bc0ddcb +size 446253 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index c1530897592..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:42fb1c37b79effa0dd232d0cf7641ef88d8bd55e4b34ebf164b471a444a0b54d -size 574397 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 71bc93969db..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8bfba5dc16103504d9cf8f69a04c4361408b4cf257f27d26f5dbd922eee0c79b -size 447005 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..483953fe770 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae4046de1849d5198fb24b40b88bde8596e9c4928e61d961349ab0071c7e138f +size 590617 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1cdbfd65233 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f443d316764083178f82bd8f098ab19d0ed8c88ab7efab366cc27a96ff8518d +size 466237 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 10456f75aad..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1d51aa778296d937bd180895b23830225e69453b09687a9a98be480a46fc875e -size 591371 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index a7b33515028..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:94dd9f4b5e4d3506cf2315c6be9c564d0978089ab8108baaeaf243e779f0817b -size 466989 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..c86a1762bba --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e87c349ca7a317ca04425804d09589ca39bc1b67c81dab90d379abfc22d661f +size 418419 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 3e968240a2c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:51c4bc95ac4d760b25978c5e0497d2d3df79b540dff4a30e28896f7485dce5e1 -size 418383 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..cfc20d04aec --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5126a081090b8bff88cccad0246de9b365ce707c286085470059371369bc71db +size 560421 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ff6c1be09e3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c11a02b2d1a372b97ab1e115f612d623342e0a077c2cf65bd8e0f431b271f229 +size 428097 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..45c9d0ea0f3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d30825da13af475de7c4adb1b7cae8bddeaad2931c90d9b7c7d6da813bcdf894 +size 656036 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..88397f1ce5c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8438c3ede14e5b61bf4d674d4d27baf0755ae39971a6f20d99b505b23961196 +size 657172 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..535d2930910 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba4785741a4320bb0ee42863b02950efc9ad329f0dd0c365399cca6f8914eae0 +size 658356 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 946cc032fb3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:77bef45dfdd275181aecde186b7c24593f001df654982b0bf8f5961ca71b5df2 -size 561175 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 85ecbef5a4b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0778845ba447848e8eb44f9543db1e3cbc6e86ad82141f04a400c0c8aabc583d -size 428851 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 43bfa4f7f3c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ef1ece9b60a2dd08e3a9b38e883df5f5ace71c5e834f1a598a160bb6e87095c2 -size 656790 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5b86d60e468..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:712abc472a9b489d4cbb6bfcb165060723388052d97a0a07e222aba7c91758e7 -size 657924 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4da7c49629f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPdx3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:52072753e812e2a3a68f32abb9080f348877c17765edcce0bf69e2441fbb49e6 -size 659108 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..2766e72e243 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1baa59ac004c9765dbb3b915ad805431b85176d7ca50f8ed2db4a66172a265bb +size 441559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 24bee6053a6..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:65fdda458d21c964ca4c05f8bf90b0bd73149aaa898c852ad905940feaf8ff49 -size 442313 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..383ca9a550a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a67a0652e835f788a483a2bdf1f8a0909d49f7f66110d3d2c8d7a2d306b6f86f +size 575621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..39305e5c097 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:385b4875668f922d268d22a2a7402d3aac2ea17e848286f37471cd9a4ee02078 +size 451239 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index d9d8760183c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b512a7e4b9f7eb038d00a44e29e0a77da2b33aef0c38c05eb629487f17038a85 -size 576373 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 2b3794c1dd4..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6ab7fee3a722bdf870ac55a08ec54ec1461ad1ca3a023cff738c60266200f76e -size 451993 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..02b51367afd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c4826e716e12eb26aee266859c8cb4e21a21d7c4fd40ea28c3578f79a150da8 +size 545931 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..44d2d44cf36 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14208d8916b0c1d5ff22bc644315abab7ea8f0c9626f5343c873cdf911c67591 +size 550421 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..050728fce70 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d702a9b1c32f7ab2509e4da0211572f39cd39f45f13fe16bfc1e6af31d23457e +size 432503 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..c40f22db259 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2689b9b5e2bdc335a63b578a4392a758690774f51007d9957c2e38c178f88f +size 436893 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index 19158427c41..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fbf3f3db4becafb009519aed1c4f740bc42d9d37929b14ee05a6696e6ff98e2e -size 546685 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index a59825d8189..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c0fc554c75bde4b75231aad2d565271ea84b824ab5bfd6484053355fae59c173 -size 551175 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index 9444917441c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:de19256937a756fe7007aa526a1bd9ef8b296c131c35475fcabc1f684dabba27 -size 433255 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 6d6617e5293..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3bc81488c7ee1284f49a4b0aba3382a7ccc1efeb5d15dd3bf1d10bbc30d4998b -size 437647 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..fc26fb52b28 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9806419d50168d5a372e88e3ee3a9e80b57893c90d6a2fc7014798871c9c2fab +size 560143 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..b107e9b98a8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e161536721381b6631b53b45548aa2fe409c6cab25114c1344a19735d55e593 +size 565669 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..3ca068577e6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d0480602f1f043d53377e4bc0d772b6d782c9ca0215f6729335663f2f17aa32 +size 447307 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..1acbba931fd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a4bd122d51aa8afc1f34da2e81dd2c5dccac2a20e6da1b9be4c8b092fa662fd +size 451253 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index 76dc52a3d6a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:edf8af121cfc9f4476b98564b81d8b1f22963ca3246f070f4f8ecd054b3f8d1e -size 560897 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index e7071e7aaef..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2b5cfa916c2e6186d9bf4c24bc67d95d1fc03e9ad0bbc34719b4b0ea52379450 -size 566423 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index 20bfdba29ce..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:625081524078be6791e4be90f592c8942fc5aad833d25ee2bbb69dbe4a7bb841 -size 447271 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 086ff51d788..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9bb34bf536570a2dc5e0fbfbd386f432570723e7a7126ca2858dbc706d13c301 -size 452007 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..75fe6adad69 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0533b165b3dcd375b077ccad166cfe191e9c07e5de37a66a2d34ebe30643b3a0 +size 561719 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..f7070e0b0d3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2aba3dff5e085b85c767a08374306230e878663bb4401dedfdc5f5128d30262 +size 566159 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..4f6b770770e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f60fe2b78985c0723fdc42b9d6e4592df21260f91786e451630879aa481419b +size 440149 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..48ff34067c3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93344a885c72e62a47a53035e5bd9233931c8d0c57b6d16c3843f073e2c5fbd9 +size 443751 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index 8136363cb77..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d2c38c951255c24d5268a16e8e8fd3de0f3c8f8c1177e1bdfc717c10528ac780 -size 561683 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 6dd495964c1..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6f8d54d46a848333f81448f21456ed75b253ba5838c1b13d52bbae906d748156 -size 566123 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index 7f05056eb5e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eff08783d0d4963e287cb02ff762450cd85aa1452c5eac81c889537e595aea9d -size 440903 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 98be6f7195c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d971a421a701c23c89029cf3532dd251b1c18d3fe1790d9333d21f1dbf8791aa -size 444503 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..505c0ca8e9a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e949af7809e67611eb8dc676e02d5c59c7a45cc2751fc5d21cc5d52be69dc09d +size 575141 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..2efa9c52c6a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a84c1139d849c7fb0f85b01df09f274d910d21d9ee6fe49bedfcc435491f0181 +size 580667 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..691c3609ece --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:207936bac6e4330977544817dd122db76e0a6c863f62eedd6be6b206c4cbe579 +size 454163 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..31cb454ec50 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dfa3a8b3e32be0fd3860b899a87e7da8bda61faaa08096d094815ea06800af5 +size 458111 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index 16d793eac05..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1dbc6cca1ab7d2338b4e89d3c702757ec7808d1115c093fee8ac5d148c3de883 -size 575895 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index d7b86f9b8db..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d0595c5715107c1d64b54785ea667619a29b360d16bf93708d78a9ac85c228c0 -size 581419 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index d585b9ecab7..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:392704fec72ec707b11591aae55dd51dc494bd375f8f050695124f980ec1ddb5 -size 454127 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 3abe01c45ac..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:44b8fe800e480173748c51228d6a0f198374c8ebfd30e90644ab5ef7810c2ec9 -size 458863 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..50d7033327c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd12e6728e365bb6fcb0071577e536d6d6ab338e93b120fa039a7cbffff0196a +size 570451 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..6d89b190633 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e58b8dbcb022dbd6bf2bf6716a706983ec2232e27f5cb8bb8cfe3e4b7aa86f7f +size 574891 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..73768530b25 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4bde05d35b5f839023ae6ac1dec6164832dfde41dee98a3bcdb1d4b95ed3f54 +size 448043 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..4bf3397a2b7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9c9f7454e677cd7475f9a11a04d72b61013c45be3b5cc28fe5406df8cd15c00 +size 452433 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index bc2bca9c7fd..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:434f2c9bff5d09584988fdc7767040ebfaf66b7d3adde93deb40128eec445c22 -size 570415 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 3bcfb977daf..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:455a0d697086956d618b47bcbb4214de137b0370287ba59f3d8b8be51b7c6aea -size 574855 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index 1f1558be1a5..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:92aec9b1af3a7df113d4057de4e5be676ef2ccd38db45c31b4a1b05d471a2a97 -size 448795 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 0e8f067afbe..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8708594caa6e5c11c990c6bd699ed96eb5bc6d660f6154613490844ab9f9e3a3 -size 453187 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..751e98eaf77 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bce1b585b90a35d45292cf55b41cc1f796677a4bc7f6652f3449298c7d4c6365 +size 583873 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..516381bc46b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba92d23642f5776c85397e06f3f7fba156b2a2d8f6a3c24df678d13fc55a387d +size 588609 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..2cb5488c032 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8051a6495f08037e8342334643b8486594d93891f40cacbdda24f39d2ae8c2e +size 462895 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..514709977dd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c447a5abb56b99fc57cdfd99f73480adce6555d18a4e02ab37102d170e5be1c2 +size 466793 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index 7ec692847b4..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:62d59babf68d35ae157095f36185a75df80643322a73f1b24226a061a5f5f6a7 -size 584627 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index fbfcbc9ba7b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4f137fec086b93d5f00d7d526bacdf86e6c2eae4ad2c4067ef12213aee8b0c3e -size 589363 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index 266f1f84f39..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1b39631e5dc772df53170af3b2c6aa86a8ad77f0c84bf30303d64c2312cf35f5 -size 462859 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 295ae4d6e57..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:75bb517365952e4bd92e485e45602d0f47cb4768d69c47fb07006f7dbd9812bb -size 467547 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..0ffb0071b44 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2032b826174b423372454468569656bb5b04f3df9dda3f62bc1d03f4346e4345 +size 541979 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..757198837b2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a8846b5b980a6bea8f80bc447afe5b52bdf0e83691a7e1be268538b6531f598 +size 546419 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..794debc5643 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff8688391ce02c6661473b3af8612a94dc06dcf0d8145d25528887de885c42d0 +size 428501 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..f9bc98d916e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6776f4136629119f937dba02c948be81d7ce06236a5e8bbe237312e2a5d2da67 +size 432891 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index cc4629fd381..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f810812c255bf32491639d1db9545ffa22a954e64cb27564389444c922fb5250 -size 542733 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 323167d5cfa..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:afeb850496642212d0a128c8d97823e004627dcb5559c62641ee3d243a73fcea -size 547173 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index e4ac36abca0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d1bc66ef28cbfdfe41aa625a615a0700004b5fb1b8884c5bb14563f7377c63fa -size 429253 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index cde55a40abc..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:56796506e7651d1095356b5114ad21e9bdadb58e238f563cf8cdba8ca3ca99ca -size 433645 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..848fb0e018d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:104097a5c4e9ba892135a0cd5dd5c3233ed6640083ab53c549ebf3966d8ce9c5 +size 556191 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..957f6250772 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08aa2aca8a0c4cf07ae157649888e3cd84c294613153c2545e76e74fc592cb9f +size 561717 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..42489bad637 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3f1f673237c5dd55736cb624eba8fc5643711c497baaec5297c213c84ffab06 +size 442565 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..bff68524168 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:244f2554fde577fc09f9e8dd214bb42739ca4bd558a63382da91083203bc8d7c +size 447251 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index 1aa9d4c7f08..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8ae2b32383b2631977bb28a0397d3827f9ef067a04557f28aa64ab9f1523977b -size 556155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 76c8c774620..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6efcbc01fba3243348e8a4759e68a4044d327916d88394440de3c4e5ae3cfc55 -size 561681 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index e4b0c75cad2..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b994bc5aa6f02d20a47db3e0fb246cd0d3aa6b3d547684eaf32167d30c91980f -size 443317 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 243491bf7a6..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fcb8441b92eae6ff2e2871afa77f1041668c6f090f53626e4902d4df86cef269 -size 448005 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..f049971c8f6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24edd832608c1f0c08a35668093d38d5304fa257647fb2bc85a4a478e8e6a661 +size 625402 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..22e54ec1037 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:381596ecdc74c5c434f554f82d51440b51f3b22b68275173f544539212787465 +size 629842 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..14137471ef7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b6052936ad6f25c8871a7c5c7a0b0b2d46de267135c430b48b99651f3c4c8c9 +size 506349 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..f94e6fedd32 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66265471695bdc5c532aa59176ca8accc26b16e5ef9e7f296ee9aa49bcea63b5 +size 510739 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index d2301a839e2..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6c8ab4698f921ff7fdc93133e1f23f1f8364ff4133645bddce5aedc4ce703ede -size 626156 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 85dbee0101a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f6c073ee3dce02df219794100c4efefc0d34c197a41a4e592ea187d15214b7e8 -size 630596 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index 52b38483f21..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e1b6cc69a38e18748e826c04c34f993c95d26acf4a5fc48f2ddd6f6e6ca024f1 -size 507101 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index b2ef9f684b3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:711a6742507e4650f3a8edfc5bae1fc7177f343e0da79c2b7ea971b82dd587ae -size 511493 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..a6774ef56f5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9423f8748d320d20e31107cec05c8ba040d6f3e0417a4dc241344c49721befb +size 644548 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..50e51358a08 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d6404619edd0b5744749e2a9bfd12d39436675c5fee4f6ad80e6f635b5037f3 +size 650074 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..edb4eba4950 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5956a9a0c52029acf9691bc3f80d51791ad6a280d44408d62f173b77c41729ad +size 525937 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..762b3123e41 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f99b2c3c4272d6eb463bbf9b35c4f12b973e2e33bb131f0545a29de78a150806 +size 529885 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index dcdb2c29cdb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8014b58215a9231a508e4f1759a3c42707a8de2e69aeb6947ebf15b79a3b467d -size 644512 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index 6139b317e6d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:24225dcb9c8679eee8f5fd1ab8a48315b0e1d2bcfa9713a7b75e2f3a8a4cefef -size 650038 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp deleted file mode 100644 index f93c8544847..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9878ea41d673bdbe3f0f0143f0dc7bad8fa0408f8c5f23bb62c8ce1118fdead1 -size 525901 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp deleted file mode 100644 index f319e842023..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:05d616e8dd95ff608b3bc4e712bae99b02c3c89416a0f0b1301d6a12c1dc98a4 -size 530637 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c2da96ed734 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b22b64a96ba1404954adcc4482641c6e1eace8a49d85de1a116840296d94ac7 +size 670292 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e564639448c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:094dcf85043c91b0dbf0198243f9cdf75c904bf4d3a85537654f82c33bde2bd4 +size 538461 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index b0e0f6542f3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:06ce55e1ffe8d7a4f81fab7b246297271244a19d0e2c1b092d05f54ff33eb31f -size 670158 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index e65f7d24295..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3ac195f1790cb5d78e192a715c9655f90c5a6dca6a456195f5d6f50fad8e440f -size 538325 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..acef1f647a3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b821d5f9f6abdcc3c1493f5ecc3df02ba2f834b63d739424a57d64be3bd128a +size 484919 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 672a345e6c4..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:306013dd81e74a9ba24a4bf4285a9906eef6ac24088b4b151888b1331d41ac11 -size 485771 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1916cadf160 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20d7bf9e45059e2a40ef01e7aaa9715a78fb3cb4eedcffeb66798425e7481e47 +size 693138 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ad1a0b3cdc3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4b2478f90513648bed595aa9cb6dd8c617b7bed2374fac5dad8376c8f29f96d +size 556323 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 32e3463d787..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4930edd6e2c0b67e22dbfcbe27d3610e951085edd3a0a0bd4bee21c0ab0901fd -size 693792 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index dc3c83ea3f0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:121e71dcdf3179b34971c88fea459903a537756978362deb8e8569a30350b9a7 -size 555399 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..fec4e7e4d71 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:183f554afcaa22c7a5d989f935a2e8272675f490ee7806e8533f563ab79fa4ff +size 496467 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4fd6522ea07..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0cd6c1e7705c205c7f143c0f69c263f6081eee9a3ac9f38aafa6c4113b89f4db -size 496481 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d652c6b80cf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc9e9f5be4b80505e9da1e9709fbab40f31061eed7532137392fbb6ae6cd591 +size 547741 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..644bc895d12 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23b44120b7a5dca80a51b071cc4bdb4d95f9cb7933d0502fdcc0ec3c4bdd700f +size 420893 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 2abfc94361f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b6783a13197b6c40eb931714fe0a90e42980658740f399d5745820b569e3f2ee -size 548495 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 78aca623a61..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bca579bfb1ddf3ed3d127a36bc284020aaa5b872edd3a34569b5d6dfc7de7553 -size 421647 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..55757488e21 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb385459078b9ec461849292a056f55b3edb91a265acfd0c66ad6cd386ea3f97 +size 571475 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..4cc7d21239d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c12243ede1fbb72fb7b6220c1b83c23b1324df849050114301e4fa6d23ef0fb +size 438755 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 97fb6f78e9b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2e1d99f181da39a8fd23317fe12b37a4e2b3d5108307a4cee2305d35036eb68e -size 572227 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 03d62276efe..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ef7b37e3fc75809ea83c14c4c2093dcd0e407dba32c1802944da4cb529d18add -size 439509 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f6faf5183d6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fc903e22c31ab42f40c8d0e1daf90283830742d8c9650ae76206e2c95eb4d3d +size 412987 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..eca4b585c51 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a16d86de14bf5479022902d993841f41961c8b586f554c1c5ea06d7be5a03bc +size 325705 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 89b5e69ff5a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3ac54bf2852e9021e8ca6491c05e1f458843ca7fd300ddf878ec4569a3436506 -size 413741 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 27fa4f535d7..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d2264843de412c18fda7066593a70759af5b28c522bace012c56b8acf6b851ca -size 326459 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..77ff514606f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50bf1a9d178db2ecc37959f02e7ed912ea5146eb8d537acf8a3b962af39b222c +size 427595 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..facb752b6ef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0031479beba178e60bd7332ac017e9548852853d7231f9623a004daf93652500 +size 339473 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 60b47774104..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fa6608351afa56eea924104de65fd93e51dd36b6c865b1e68c97266e72247a5d -size 427559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index d24bb99023c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:60357924aa009f3945ef537e2256eb954fdd073ece44173e6d34f8bbc0a091ff -size 340227 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ed3c98c2abf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcada9c0203c23939531fcec86f4b9002bc5655fa44928ffbbcf5e3f118befc7 +size 513483 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index c8d5c008093..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:29ba655439149f2572754c900d5d2f7003460dd0b8d46a31650b682e4481f2de -size 513547 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e7e8b7219fa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:184571b0ecf23e93c4fbe441c1ccb69281b072354501bfb64be8fbc2bfbb13be +size 524883 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3779ecfcc94..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ced5de0a8a7a0f914c5cb36089d820d4b71f581d1eade3cf45e5b44eb14a8e18 -size 524847 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..13605e3dbfc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd61e4fc47ba3a93cb31e7fc8594fc96432660dc314033ad144ba5edf6443fb6 +size 484829 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index e11c7941df0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4a8cd3b1898e06315e0ad579e9397e686ae8b501214a26ebee05321245afca59 -size 485583 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..3c0732362d4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7923186a911cabfb881fb348a4e42f731db4a8f8fe5cb7967a0531b3d8b265d +size 497067 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index fe84f293b57..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_eW8_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fae903a15a99aa66c11140c8c9471fd4377e3edde3302e4a9dc9e3f4ec802b1a -size 497031 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c3bc7f4f5ca --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fa30d15989c3c039c751aad16c9451f81eb6e7ec96a0b40cc7251401b0fcccc +size 564021 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ca4f56f6e20 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5b7f409c2ab41ec061e3c184c31afd554274921715985eaea515f79409e25e3 +size 438407 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4860cadc476..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a378ddaa3b3de8769498db71494c8f75c0d946692fa39838c721fad00894bd6f -size 564775 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 2374bd7d53a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7c003cb50ada5f5b6e5c336d5640539218a2a59a0cc8439eff9b78ec10d9c7d1 -size 439159 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..56fda87e615 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2470c7066d54b0d8d51b4268b58270241e3dee5d9b71c0f36cad76cb65fc5006 +size 587755 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d6bbf444bae --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f24387b2703f8bbe792a3707dc2f3f4af0fe30472600a74a1657970602d19f5 +size 455479 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7dd9fcb563f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a9160577578207878e3dfb0e0e1468258824ace9337878b6f97ccb5284c12102 -size 588507 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 84e14e507d3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0b85f47cbd555e3420322d19cedbbe49a52886d070de9d7f1c303205ced14904 -size 456233 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..a50baeba714 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f436028f7735e126e206907407b1e3589b4f22916465b9fc43156a5c2a354b7 +size 423051 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e3053901444 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b9c3baa06a2a23cb91d0e27ebfbdcca8d41923faf445c346e4e9e0015104d83 +size 328961 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 81c37edf57c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c89b16b081c425bc1cfe97098f23eb0c3862a9a522558a84eda7d0d0c2e14639 -size 423805 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 1d8bbcaf0a8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:242db0aaea906fd2995acff603077bfff586354de4d673a0281b0ed8c7c26c6a -size 328925 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..deb410f8e25 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf61ebbdba444584c4743a4cb4a44bd738c35c490ed874bde8d4bb6f3300dfb8 +size 436919 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..8f51a8dd0e1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68bb1ed9a09cee0b5175124d572dc24370cbfc52606b441501006ff2c27e259d +size 343567 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 06774ddd57b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:483ab5a8d320ddfc600f079c2d39b38947b423b4c86d038aa65c3ac7843fb448 -size 437671 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 06c9fd08398..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a5c1f09620411a515fff45889e63ae577c5e1d949d54a2ac17445039fa8d773a -size 343531 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..227239ba34c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98d5046cee8334a562761ea12156bea4be425a67262e7934c63c644651ddbb06 +size 603783 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1d37cb27934 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db62cdf42fcaf916c8c1e72c8f3159763b896d999aba58770ce4ef398f73cf51 +size 472939 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 6267e7bd5ef..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:805730875ed927d20b321eb3f8706500c370b0998c00ec32dd820a26bedd035a -size 604537 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index e0670f11c31..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:36985fb0924a3ceefc886e232e1f5c275c7e129773195a789124e64f970bbaf7 -size 473693 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..de75a6ded35 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54fdb80299c765259fc616dbe46c82a7733607e00c2edcf246eb60658524da4f +size 456253 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index e0441bef6bd..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d576a825a9a19e0b7f8844f7a774468360d501a23e390e2b5fe782c44c4a4996 -size 457795 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f5d895c5371 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cae78cbd57159b5298ad284c170bb130d7d8347390262e5ea7ada2aead542e7e +size 628308 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..bad12c1a3ef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97a3370e4e68ee43f14af56ceb154fd03d9c8c8fca8d183e83b66a2bd03a6a02 +size 490013 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5089e95b00d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:386f8638a1bc64a557b097af19fddf056275ffc7643c5cf169de4c034505ff2c -size 629060 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index be5e6f05f57..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4e84136047f6d0ee70c766bb3bdfe7c4223d51cc220b9f57d84195e43dc1333c -size 490767 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..89452fd1d0e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5d421133702f270074abad0f35b010daa51037abbbe117987e1da656d7f4989 +size 468591 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index b15fc82e575..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:43fa257faddf2b183d603b1ca8ef767ab4349380912234cd2ec387732c6ed77a -size 467765 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..f571faab398 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd389bc4b88b1e94cef259cd53b614e00063e56fc4489e401f2badc729ef6ead +size 401095 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 5faa70fd286..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c3223e4c30c769c9c366a76a295f81090988827f692550220ed2cc927ef2897 -size 401059 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..dba8cf29e08 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e536c8c0ea606ecccc270a346da9c925390af0c6b5c5e6243efb27c905ee5d21 +size 543393 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2fc5ac70ef8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f24256fe63c95179802d5753314eb546f092658745d75d2ab17ca379b959c6c6 +size 416989 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index f18e1b318a0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a621dcb89b2a8eda3acebdcd829ab254c5dfbdc5e8e752aaf11772b2381b8151 -size 544147 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index b47072332f3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eb33e53b308eb07b36c3770ee1dd6eea2a2be0b50e32699a0e82a5d4c3c1a526 -size 417743 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..0d599f11aff --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d82128522329d497f87a548f52853e0480d8be81720d54ffe9c137b400a693a7 +size 418315 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 15aa2e0fba9..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:68b9d59191e4599c9454b84f2a462e249b1c335fd2cef863f33961d46caf0fb8 -size 419069 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..6e0943b301f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:010dadf212a675369cc4ab37f1eb43ad8163397fb6334b8580e16e2291404c9d +size 567521 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..3e989da3292 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6e04820f9bcf6fb1bcd1f00c1f95020dc94a3f00370118b41cc124c3b6e42f5 +size 434901 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index aab4bc36681..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:894a87318aec9dd78e6729a636f8443564bd1bac071489429f735e92dded8abb -size 567485 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index b286ff4e301..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:46809be335670f08b944aa7437c9ccfa8248b3427a596ecbe9b0432e37b5ecda -size 434865 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c56047f0e7c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da037a166ae795a52ac88747a02b4ab0ae4a8b63bbdee3d608ffeaa90280d4c3 +size 409577 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..3041d52ae62 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c87d44236856286e0bc3e13ecee9d281d262532dc5c238ef869034f25e52ab79 +size 321703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index dfe1d857ec5..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4d80509a4bb1d10bbf7626b74613d5d5c5890a7f54a83a757b603a9819ef906f -size 410331 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index c468287b06b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:05ad048e0ef0cdad0bdca45ce435353e30265519ac61e2904c6a51f12c93c1a6 -size 322457 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..926d4a33abe --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9afa085cc6434916b9d8f4cbcd34611c0e696f63b62c374da53830586cac8c79 +size 424333 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..91fa06ee389 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd9cc0e4a1c20022fce87b47b82ed7d57c1da87d6521ed7fc08de0b976ce52b2 +size 336309 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0389d95943b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:839e9b381fe946458198d0cccc14baa4016360d244ffc0d09e6be705efa267bf -size 424297 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4b816724bf9..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ac2155e27ae2502c98d5753f0c5f5f1b6857cf9763b6f1d46ab589afdbe28f91 -size 337063 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..dd706016169 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c47cac7ea8d5457378a372cc6a6221a1c751af1295da9b6967540db687f782b +size 322581 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..cada4db386e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f1eb3f4e5426e09c8b4f6c57e7227e25a88045479fce91024e53e3c76228b4c +size 326737 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index ebb27a8404f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3851df60574d506931c37bd8ff606f0c7e2ed255820dde54c0dcc4f141ce105c -size 322545 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 5f5f31ead90..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:123259fe45e57e0dec04b267a2ec730d6cb0d89de9c144755a5a203238cf06ac -size 327491 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..1c342a47bbf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69834d30ed3e1a5fc0ddd9d7fe62d9d5eb2e195df47588485799c5da7c43cba0 +size 344391 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..e6d7e6d52a5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40baa3b185bee27cd06bc24fdeb831145f9df131584f4641658c5034e1dd4b41 +size 348595 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 6389365a0c8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:94741b8c32133040b8cb9a2a073b57c0f4a11e624945a6699c94492de4641a96 -size 345143 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 925a841d645..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:92e29bccae576590648c0e1df99f5af18ee23e0c8d8e016888553f709b523753 -size 349349 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7a0503663d9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59873273d5eb5ed86f6da9ae745ba74af573e6621ad0fb02ee0df4068b37eef2 +size 604507 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..049d021aaa4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16e70ed9571f1217bfb675f60352fafe26841e1df4adf4893acdfae7dfca1e1a +size 492113 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 55c1cf56760..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8f084e44f8a0556ef7fb5f150c5f72df1fa0db17d96ca97abd41b520f70f73a1 -size 605261 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 29ede6ac07f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:95081dbe757b0863aa19e350dc914875fa0fb41a4869ac8e69ae4da92f7ae3b7 -size 492077 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0324e97d2ef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9541c7f1d5b0aac46f49bfa6cc3d41217c7827c81488d464b4298412d44aaa01 +size 618918 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..920eda17174 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a489d86a4fa05ca775113462911a48eb2e59f3ae7eeabbd538b1cf3845b8812 +size 506523 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index a4b44863268..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3b21c637236e714ffe6825e3c11ccbe1e01d0180268f0f79d4a9e2f19c6b4c3d -size 618882 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index a8dca57a799..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e6077ff6424273cf25ce5f87c273a0b373325f4f0fc4d1971a67929a3bc7ffd2 -size 507277 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d1f049e8000 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fdad7f583014c9cd953d7c443828bf3ad417233e156b261e4f27ff53d2d8beb +size 690644 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5c55d823d2e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bc5053fce7acd506260484505ac2e4bad43610e69d83591847993d8eb3007e4 +size 663810 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0ebbec50745 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07e809dfd556183ae2233823841c7e3952d6556b7c8f482cc00d38d14efad7bf +size 576029 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..94a99ad8076 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1358eb84eb5db620efe7e6f6915df505d375212f9fe2c87b7584fb3f007c86b3 +size 550133 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7b49fd82b03..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f66f2c17ac99db4a2676e04e3e3e04a8b445ca5129a2276fdd97d1ff9f9d53d6 -size 690608 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index dc92363cbba..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:869abdcf598be0990ca8f2e32edafd42f316a6bf603f3e564ac42f836cbc685b -size 664564 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 44cbd440247..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fa2d221a7139390131b1945a327333ac8168a3d4953c2acb4f388158ffcd4279 -size 575993 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4a9e5248fca..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0d7ee834af3f1c094e4361208523d1821f96a684aa5aba615131c7a4522801d5 -size 550887 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f40074ae726 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:048294570d794dbc2fe8410f3e594d5d74e159cdbc0dfe453f8c14edfa71163b +size 710874 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..609a65ce6aa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bd9cf7301a5e38815dce431a51764de1e448cd500c07b3f4e2f422404d7ceb7 +size 684188 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2f7f24760a0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aee2996f8e970dd49b94b532388676d6ce52928be3ba05d5e724b9d610f3b8c4 +size 595965 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1b4a248c1de --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3becf5119591e3a99ec0d3d4172c96b138f0bd162e35ed53d28c3593236ee23 +size 568933 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5a27fd69a27..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:01b629ef307a8bbe27c7d52ed158cc3bdfe5aff0a073c401eda24c9116989442 -size 711628 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7e36c7c51b2..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2926bf1545cd9cfa1d9dea1a3c10e39f8267c5105e72de0f2b9f146f3fe43e1c -size 684152 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 54a9ab618ba..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9cefb379b911cd878f67606ee0b12bd5e74e1d8600188c0d4614e71e3b2cb298 -size 596717 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index d2d04781161..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5fe67d6460fc71c13b0dcb04396105392f169dd88d5d56a63d8b4829b4a1fa92 -size 569687 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..99466b4b16c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab7b750d3c2f8b60ca9f718781a6b33465ac2d9f099c70d70853d53a2b41f163 +size 631296 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..47c89830046 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4185234b71c8ae08c9b7c47d268402634566e9db4dd473bede30b9929f180b9f +size 512093 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index bd0e8be7cf7..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cf77885d43e8b96a7ce460bbac93fdce8adb83249e58455899b6fb99cb60ff69 -size 632050 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 91a41e5375c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:07ee746d15d1a061c417abaf3d69540b6b6564337c38703b2a20b6dc1eceafd9 -size 512057 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..fdfd764928a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a4b47db4d98c594df52c416ed8fcb7fb5676de882dd7e136bf4ca532f87a03d +size 645706 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..363c842ef28 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39ebb14b6ded26421f4c05b448248438892e16b3a6f09ad801a69d0313377cee +size 527243 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index b111cce49dd..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:862631f3d85820484bc030aad997c3ea7a66f5464721d4c369e28ab6762776e8 -size 645670 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index a0fc0d9c8aa..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c33b56483a04e92fbd0d1a62e555a31549a873d247c071f8f1311ac9fdfcc77 -size 527997 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9d8568b9afe --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2006854267fff583721caf7a334c66eda8f712dccabca04abdf20a6567b47176 +size 715952 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0951cb951d0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c5a74cf4eb94f1f627621c8709fa4d9bb22a73f2ee472e1b7179f86bba73b92 +size 667904 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9cf93d11d45 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d6131d352b37a2719cbbab63e56b985445a45a8ee64b9fe0b0bf69a92d97bcf +size 597539 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..59164d063a4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f26b52fe5b0ea75cafa85f6d8bb868a2c3dd52ba23315668c154ae479954917a +size 554425 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index abbf4795888..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1bb7bc4b314785e9aa19284c4e3ee62b9c3ff81e72850255ac70f13757d0f078 -size 715916 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8d48f093a0c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:90457256815cbf0c03f248054dd9f56a0370c440c45c88ea3694c5664201502c -size 668658 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 465a5addc28..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f69b4fe6900daad0c0cf146110e0ec800bdf959468b6922bca1c47789d4774ef -size 597503 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 28285b0d8c3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:513a251256dba6ace15452b209c80a2c3dadb58c63e7b7707f86cf6eeaa694a2 -size 555179 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..75c949f6193 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74a2a9d04b64382ceb4fb471aedd3d1deeb57925bc75ec64cb73c74a426f6cf9 +size 737120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..60fdee7cd67 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfaeae41382de40451e48b96ae8ca97c52c38c7de9fa316ec34f339ce9d0dc9e +size 688234 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..a6edee7d3ae --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c34c182eaaebef7903ba2a0d7535de8d5dfdf04a19565a679c3935ef1c94341 +size 617523 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..768c9692131 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18960f7b7b2da00507a72e9c857d171f317322c1c4e8555a7c1e0b9159aa1370 +size 573225 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index b272107600a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:142f6f33580cdd5ce8c0179e4ae0111ef9c83e7dcb869a88653b775292ae393d -size 737874 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index c0ff4100c1a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8d8a49e75e0eacae364da7c947926ee47d758de78fa7b35aacf22d6e2563c632 -size 688198 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index f3a53a1fb3e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:396baddba05b0082ab205b2feaf7d22ab90fc5e4711cb6498bdbba3dea742c05 -size 618278 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index e04d2683aa9..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1c3651ddd22616226f7794860a7b8daea4dadadc5253eb25bc6cc85cb33bf1f5 -size 573979 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..00db79514cb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:992c1ee34ae372c1d27ce926b9848c5ac69cfc100ec1ad81319a87e564f3f8be +size 679988 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..476694257f7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41be5191b163aaf7f0aa8cbd80d354d55232321dd5f99b5e1411af464d7b651a +size 551807 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index dedf72b9b2a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c478fe5922e31a6d484a0bb23dc5b758f912e1bd1dea6a2f598a504c2c8172c6 -size 680742 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index f6601f4863d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:18835b7ccc3f0490506acdc99992e0875e4e2eba78deb0592066126d6ec456e5 -size 551771 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c8e583fb55b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e27079f3c78ce1427cbfd5f1b9fd664148f1d5fc2c9dd2e4863fa37a1a9ca8f6 +size 694546 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..6bdd6aff717 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da93cc615bda2667dd5c344a005a16fd013f321763fc9e07ba0b1e329dcfc03c +size 567005 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 73708c60662..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a31a24064b149db05f4d26b3feda2da54258a267a121520026c047f408c1a5fe -size 694510 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index b84605b70b4..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dac9cef53d6ddc1b9edf2ac0cb6ba0c368c9e79447f66ec5865129326558f596 -size 567759 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..768781f67a0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebc676587758d663204b4f03b880fb1f35834cf97d942061cd3754989521d513 +size 772044 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e5b18a53346 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39d6fd44f54d25c009885c1d9089819c22eb546c0325794bea84e67cb44d470c +size 676340 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2463b361013 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16a9cdbbce87b4374fb7e5c381be6afca3636f12a6098993ea536eb0d171b85e +size 639670 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..70a5fb10b6d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d47b7c93766992203a192790dcfa938e9d84e4e2286955ed60421879add91a79 +size 562763 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index a18bd0a28c4..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e30ff779f82d52a5e90c9f7b3904f9a3f41a5f8c11fa19c8278913c658ec825b -size 772798 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7252d97d7cf..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9a4ddc7d03c85c7878d79d97a80cc0d6686e5189ce85ac6452623931253ac0e7 -size 677094 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 281fb80f8e1..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:37a88e1064adedc9c2b3978b2aedf581a9c92c6eaec38d395f86e0d4fad933ff -size 639634 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index eb157d00e7e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:243b092a6acff3d008291548c9edbef693139fbede884d657e9724e55cb6f082 -size 563515 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..fd9339131a0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54c526c2880b737af859ad88a894d78f681cb3761ef55e4250e216c48368f21a +size 794790 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7f03b7d158e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4ec7a573482995bcc740ef2be8dd996fd94594d37155627ca103319c96b957f +size 696720 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5b915d3334d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bcca6bd5f775e4b52d4e6c6eba94e0af495475dfae3955629518f34f2a4d52c +size 659606 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f25e37b67a0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfeca47d5e2a7c67c35cae78f589fb69e1ea3dffe935f5779ff1b0cbf964b869 +size 581563 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 675f27dda19..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d2144904799aad2ee207efd5c0ad11188877689d7ec5c526ba7c450a2f410df7 -size 795544 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 108fb56f041..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6c33e935421f6a917b1fca804fcc26e536122b81cb10ef49de7992031b6d043a -size 696684 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 838e5dff8b0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c4df831a272980963e10674702e2f6b9ec6ee51ca946d901c6c58560e91d9ec5 -size 660358 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0714cca8c61..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:82ab5b65c822799ef35d6fe69e80be180a6af880f7eee18d40eff435c0eb7844 -size 582315 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d5b52bc55dc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5720a864a077a510c0a762b9d098cacc4917b240bc4b4c511dedb5f9d2387bd8 +size 591527 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..46e515a7636 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:793c107df8bf3e78d6d9bccea9d5d971f94f0f07ecd01b99580eb35200676ff4 +size 481599 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5c7cf2c3283..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f8f24f7567712a2a641f846a370a406590a8ec28afe41d11b7e3fb46fd4293d5 -size 592279 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index c29d11eb3e7..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b63f906045b1c23b6f6097592d06106e72308899a84e1d9c77611188496384ce -size 481563 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..758747482b5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a711b2a272d3233915c6fc7859d9244c506a7294c88e98264e4f901e3ac2db1c +size 606675 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9668f8a2769 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90c4119d5da494c33665f17d0c0d5037f082d8966bf1844a82c61d6a77932b0b +size 496107 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 2fb33692714..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:edc1f6569cbdc3774c98662e00a57e91296c19f7005661a99596a2618e9a6a54 -size 606639 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index c46f273d974..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:85ce2e12914c0c4262de433725bea1b9dad7fd72eb89760e9b9835280044965b -size 496861 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7e7723f224d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:896d3995d814b21458f845a8e897cd03c6b2db046d81ba2a8a63cc3567668766 +size 678206 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2b4b80d6581 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad56204c2d0b3a5d370e4f067b62626d04a783c2ad9ed0795d7c475b99a66e57 +size 660646 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ac24c3c42bc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:948f700803f4ffc735f978635c2cf1c82baee62efb729bee2250e7c34670a8f0 +size 565121 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ef280860660 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4ed5ce09bc60d601a90dfe31e41459545baede025c5a3286db4d19db9770b8a +size 547857 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 9c3ff795e68..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c0bc5e1383545a44af16b026334d359d3bc0eaedd856ba61ac8bbb6d12fe978b -size 678170 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0fc71737708..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1804bdb2f2e3d9fe2aecbf5dab5091db45d3b806bbe23821facf27835c0c2643 -size 661400 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4177c9bfe44..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1c51d507fedaaa76347c1bc861b3ac63ddc9ae7aea49312460abee52a0485be5 -size 565085 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 2a0d2169656..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6e004564dcd7b10dc0b61235c9a5696315682c6c48c8d7e8af13481fe122662f -size 548611 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..bfb6cc4c44a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0412338d49145224b7663d9eee8a6774224777e03fbdadbf7ea78d02d5772de +size 700262 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..fc78bd99c8b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b8b93acbfbedac1fce078afaf10c5fff787612b7704b6714ee8f186d22915c7 +size 680236 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5afcf2c44d7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cddd2e8a0297b8fdb376583ecc4dfe24d8c31c758b79b1bda019bd332996aff7 +size 585155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..04785b1da22 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe7cdb800e47fecb053fe12ddb46e023782ed52866f4721d8069864185400065 +size 565869 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index b90d12d7d1c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1a73c2fafe069d9c3fdd55f90c58e1fa20bae7f2b790a574f0484decfc1f128b -size 701016 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index a821888a2b0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d6c2f97a55f3d0fd1a5fff75e4eecde1755e8314d0ebaa8e5ad268e72f10908c -size 680990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 2bf191014c4..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:19f99ba1c603e7fdf548686c1c93286e9d3e5c0e56a6b7d37dfda9ae0fbba768 -size 585907 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index b7f8a50181d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9e97c408f02f76f838562663990c4e1b012a78250953539b47a9f5582c0c3a29 -size 566621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..de603df5d8d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c379fe3d229f0bdfc263c85f0d4dd016db73155a71789f46b8f93970f758c722 +size 500069 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..bf34d9f67e0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da4a624158434a36d7a36b2e0e48195918380ec6d8420fede5541cdc5b41429e +size 387331 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index f10502ce05c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1d5104820ee5ff70cff122828301cc47d5185d062c8bd1a8e342936428e04c70 -size 500033 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 68735e50b93..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:30943c94bb40051c6f9bbf2d2bb885b21a278833ba1f8a382f2b446f74618f7d -size 387295 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e96a9fe1906 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a9228c39e8d20c051cf130d15b939c122306601ccfc1e5e8ad79a407813333f +size 519067 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c97e854c869 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da3dcf9ca6ddd6ebcf423d8d45b19d3f1cb8c09e52c82655900af1a229ed3a6a +size 406969 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7d5b997441a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cc30e550b7696dfbc6a3df5b65df1948421fcfd0ef9eb83fd0ed19a7d58b1885 -size 519821 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index fc241a4ba4b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:816b71ce6ef7141b48f91d26d55b8e12ca46cf299a70630bcf4bf2b416d1ed8f -size 407723 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0f81e75a413 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69dd9c018f5825f0e7005988801120fcefca61185291dc41c5fb48251de1f27c +size 507717 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..557e7236899 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cb8bd533ca74810f7fdd1d60f4e8d78ce4f1a0ae79d47c94bcae2053eba7cfe +size 394189 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 96ebdf86ae9..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6b14737fb4483f1ec7570faefcb8ac30fa89c2edb037db087e7c13af56b10ebb -size 507681 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index aa96dacfdf4..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:831f5683d82499fb79da8265cd50bd92da250fd5f61816aeafb1c9e28b72de6b -size 394153 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..445f75e6d87 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7f68b329ccf0aca6412bab2237f1aacd31c78ad05fa70f44cc32025d19b4385 +size 533867 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..4e49d3104ed --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a37191bcdb2f0e5badb18921f6fa96ad9fbcf712cbfefd5321d7d0a9e94f9728 +size 413827 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index a4470365870..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:670c41515b594dab1aaaaac435ce77e1ceedd90b6d96cfd020306f66d5a58ca2 -size 534621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8141f966226..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3c640a37e0a23fd275460d60654d490073fc3b5a6fc36d740da7c156b4d75c01 -size 414581 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..bbfdcef3edf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6029dd8911cc78ebcbfa572ca470d2cb6f1a914a141850226b092500236622d7 +size 515857 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7fca86f96a1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51e2522ce8d1ed31f08bb18c4622a7b97f6d2e75c289e365b8c77985b853a7f6 +size 402871 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3cfaa27ff80..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:79d104e3b6d6d530a2dcc85166771e824cddf7e14aa78c39af843dd9ca8e761e -size 515821 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 9056cde4a6d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3ba4c089006883fdd297e40e990c2bb51d875ccaf257c6d53c8a96bdc8861b5e -size 402835 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..73b9a657428 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee8aa6182bd9b10fc8cbfd93af9bf502daaa9ecb09c32d8d1a329977302d2720 +size 542599 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..68de86f255a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd5fa09bf1bd52105c73c2e04f78e1f3839f424099a3ca4b93c789ccf15a5776 +size 422559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5757afd6fd5..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a95b4b053755143cb8ca2e4a737d46a0165b4812349124ae2aa75adc61b9ae01 -size 543353 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index f93154dfce0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6b5811ffe47e59f86c996128622340cd492a9386684b440274fe125cb002f01e -size 423313 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..6fbf81851ae --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7ac1c1fa35398a8e011a67a56a66c8f56d1e1737c639204bfbc68d57da608f9 +size 496215 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d1a4479eaf6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba8b65b46251310f851f27324f8b37d8b6b07f0746fe035c922bbe008e2a0b1c +size 383329 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 893aef308e3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9db16e1085fe71f00f26df0fc8ea0e2711f9020d117b3f7641045b0a5ef9c826 -size 496179 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 9f3c3c638b2..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3039e2a808d14e675858483bc1b094cf598b44796d700c195929029e8a1e67ea -size 383293 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7447c9f6313 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d2944579fd79c9095e4e0485525704f5d3b5403b1fe073bdfa3f6739a448227 +size 514917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ddd433f5c84 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efca7f5a8808653908747f58ea74ba8e059251295d584c5c5c90150ca36fb980 +size 403017 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 35f5a74aedb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2247043ce06828f68cac608f0b02b5ac4994758bded20025510e0aadc34de5e1 -size 515671 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 69430bdabd1..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:db477436591061330b8cbf8f10b65ec5ba429bf15f066ee91fbe4fc1157e8180 -size 403771 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..a070b61a9e0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2a9199123ea4ffe49ac74e0d95573c3e11b52319b89a6f30bb205998e1b11e3 +size 502135 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0637bca2729 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48148cac5c17237e9f0de74bc0c284dd7dd36078645f2949248386f45c842ed7 +size 387523 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3e5ccc8518f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:36fcc8165fc1064149d0487aff3b7dbff252e3b6165bc74a58b14936ab0e5ef4 -size 502099 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5be721a4d6e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:435731b8186adf8ab440d112233cfb2bc8b0c76f4f0a0212df776f9cefcdaabc -size 387487 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7f141fa27a2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d32eb9b0de26dd75cb0311bdbc6ea6599f7b0c8391d72c42987de351e1ea4139 +size 524685 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..26bde0167dd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bda40317096a244f62e5542828d3f1d6b28d52c5b77d57e04ddab96deda82e3 +size 411947 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 1195ac08bcc..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:100609ecf453ce9954540080ebaa14ea7a0a8d3a735c97b8040e9df807d0992f -size 525439 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 94988bf9b99..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:84084f2aaee558e305b8b2617648403c83d77a7b3448e184ee5128fd23001c3c -size 412699 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5a26735c556 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffa3c7c6b1a61e03eac69a062c31039ce39ff9d0af8acc39b64f9f069a801b48 +size 616927 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index be1fdad9fb0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:77b39c016bb796756445d9543540d91513d3ea8dff83c9b3f5a659b90c201bf6 -size 596171 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2279cd3d617 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7878904aab5ea3d68a5962d22f1f4bc722f8041092d11ffdbc9b10906b9687f3 +size 633556 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 1979029e6e0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4d2e3cf0e8de805a92030a92a5f39bda17ef6ddee19c1631e34a1dfc55afea34 -size 612109 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1d13f1fa5c9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cb13b1b3d9c97ec04d2144fc44ae11a3b5e8135d0147e1b72864adfc3e61b24 +size 606073 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index ac2d5e2b001..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb05fe9600a881c236a0c2cba0fd074d2a96e03f22f40ed4d457d5f6b821f817 -size 585317 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d701d7ae982 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:581e013933dc8cca58cbd18c6346a68eef0f51d6c0a7999f98ca31c8770232da +size 632028 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 07acdfe91ee..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b2dfc7fe224dd3cb9468777ddd71bf1e788bf7e8bae66bbf875ea63e9dfeb4c0 -size 609001 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..53da7a3c373 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fee46c476130e2d6b2cd0d7a921b7ea253d68320908d353624a1564c69204164 +size 560387 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e24924b6f7b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cdee8ce64b38ab842f06a7ef708b81b17d7484e802f1311b1a8ef9bfed72088 +size 426631 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 32fefbdc333..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6d4b7b588550d022af058461e34402aeb90aa0c3efbe5468ab96bc4e55de8672 -size 561139 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8534ccde4ee..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b1a98d0cc74eaf0184d9a80e3d8661ef0ca9f82fd7aaaa08ced11ce727ad8674 -size 427385 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..fd3da1a2692 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b4fbba84f8ce5f99b4b44332d909f5c0684ec8a20ee86c82f47456d43ae1a3a +size 601037 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0d6951f50d5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:534a813dd4e2a277bd8c2801fa4d40766c051fac8e18d8d7e9aa47bb776f490a +size 467135 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3d07e6d12eb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5d9a77e02d2b1760324244086650639cba150fa62b0e22dcf21ac0a57dd96ea6 -size 597695 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 48702469152..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e3e54b9d7a8ee6e1b20ce6026ce3469d89635c222dd38d46c71eb323da7a9699 -size 467147 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..8a8a9c62b30 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fe965b15a3f16da727320ef38cb31f91bef0b04b1ff9257d3dcc73bcb9ccf36 +size 575091 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..214fd84388c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b756df2d799f538c2a95c38c62756d66bfe202aa3254e91c9da8b15f6ec91ff +size 449723 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 46f982f26c4..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e5db00e68894b61ebd19743674887ed06873e3ef95c7b87a29778f96dc16de6c -size 575845 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 439efc9692b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cd327597ef621b9f59363acc3516215a7a462632ba8e4d4586f3f9d469a8e2db -size 450477 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..17da8223816 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3388b7f078f94bacd1fca9fef322b48eadc9c06dd38ebaaa48040daf51d3a030 +size 618506 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b74c1c7e004 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cae2a4446ccd1f72f3b277e783d63d0dc56df2a90b87ae52325fae53af1a8c3 +size 490275 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 076112509cb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8a7863584280215d23666fd023bced384b2cf90bd9a4f8e2169252337985c3a6 -size 619012 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index fe62b689106..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7294b99c77b76f0fc2e047044c9ced1a383cca4c7257516d5943680f73013ed8 -size 490289 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..68c1626bea8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:176bc0905d31decb6fed9e74d63b6246df6ffa5f9ce57452012d5361169fd750 +size 616343 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 630ae405885..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c09d68446c0f2c175832464a1977afcf68aa84316b128a73afc618f293239712 -size 596425 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..bf83c99f957 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b45ed453d0f2ff5243b7c0c4bc6f45ce49250b094f6171ae8c02083602f95cee +size 564679 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d4e9c84305b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a116683dbd7d062d5b824e20dee33987eee862f478a89172a0b9db1f140bf19 +size 429987 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0e78388ff70..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d711708a45194db8f9837b4965a1ddc564306738afd5781eeee67076e73ae6e3 -size 565431 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0cdea7c2b74..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:247edff686a7e9481eb22a8c3fa91f82935cc2ad86f09cf72248d5d141661b8a -size 430739 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..bf39a149fc9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dae520734a696922ed421d052bb44d7599192ea23cbcc912494f5057daae05bd +size 598571 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..faa45e9e0ef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65db805ac4803ef4c0e917fa9a1c48ca3793fdc915d4a330dc0f0425dce1ed00 +size 464471 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 537344197ee..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7e8b3dfbfce805368a6d3bd86cd16391842de496f516111db6215a415f714368 -size 596857 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 799ced89e37..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8f0b6ccbaeea844e76f76859528ed54e2d673c6d16d0addbadb64b824286dabd -size 465323 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d093c582de9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c7789b3d3862970f69d4db6847c74f5231330209e3ad96880332fffb8072117 +size 589201 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..394179d0b96 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23e2c0225fc0733a39bce505701efa51d2290c813d0be0a1b7c3207952411685 +size 453127 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 6cb65020e28..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:42ddec16c301a20c057d4e0ec250afcdfa83bd8274bb87b0a798d9d81cb979e0 -size 589955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 61a9efdaca3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b378ea9c29ff3fd875e44372b58c90a63431fc2f213befe78b0e154bf929cd57 -size 453881 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..017baa3c3e2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:151bd2570bd0e103397099877477f64239d0366ac989ed958a95eba8b707bede +size 623834 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..3757e57ca6f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b355e275468b19387bd1964bf283da62e91390454aaf6bf3c7b4e9259b5d656 +size 487611 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 6a0fbacbb79..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fec1da4b9239437f7ed8e50847f9a3cd2f97c4024c842727ff5b392efae38920 -size 629324 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index f9b9c9b3a9c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5bf4a9dfbf34bb666ede11d8853562d99a0584b13cb016ab31f9db869e47eb5e -size 488463 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..29c2729122f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f92f0a805209318722ee4a832dfa57bb230a7b2d7747b2f8dc4f5a2298c378b4 +size 601629 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7ac52390d7f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c570e90ffdf4fcc58d6d9abbbcc972e6d41700bab0b5cca3540598ac8bc904da -size 602235 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0bbcfcbca08 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4988401299d6a54a3d08ba7d83e34baf7fa55a87bffecfbd53fed08993738554 +size 622304 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index c01f87b059b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0fb533838fdc2911e92c5610f4142ca665aa45f0d1825490f4c693c189e1f999 -size 622220 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..06a6577ee99 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68562ace7cff71e641a5a62fb3f5885e5ab1a9fd3b4d07577e910fc026c335a7 +size 589839 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index cc77c77c044..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7c20ada1c7b1a680101880e019a84cba3d04d44d81015c77c7b7fe5b3e9e32ce -size 589703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..188ba882f79 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09fb1883954b91657092eb91ec48b3c1c5193cc719fb6d84aa8e491970e793ee +size 619000 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index da8e7dea7a3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f3fb6b42562ca860844dac0538dc2dc3b54eff41aeae3b87504a32a6e52e5664 -size 618914 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..fccfdbeaa0f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dc92e2fb6bad5236ae23b55890411cc922e5dccae03788d0cedc156649b0a9b +size 563291 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5a5c46b6568 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7f5b44b681e4d5337af13d33fa56851349a56d225dca7e06ed57a6dca89e6d4 +size 428451 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..50578c2624f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5650563a12076fe3d2d78024d87aa2ed2577ca248bc6997aad78072dade9dd18 +size 653292 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ed920c6a272 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:548b0c11c9e85155d96db099c08b7698977dafced83f97ac009f1554d45b9f65 +size 484361 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3f9403cb6b7..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8aaf360dc2191cce86c168239f04efe89f0a07d8ac8a917fe56b05eb2d5bc3b9 -size 564045 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index e1aae5f3f4e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0138fa74ca04b21ffd1daa66bd7f24aaa40b8918564910186eaa7f607a1b82b4 -size 429205 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 43b942d8f3a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fe9b103d6c9e6981be54431f2df310571cfb3eb67345466ce561c2022318eb90 -size 654046 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index a9f2318c05f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:591b3aa2749fc5782ea2cc7bf56c07e6485ef47285a52359d3fb74855ff15ee0 -size 484325 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..97523418b7d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd609c55507158c69905bc49f45c5d57e726c5a60083d05f93480af0c751e532 +size 579081 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2233129ff27 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0bb82062a903e391fd8bdbc5642e2f5136de2805abb5bbfa272b0eacff44785 +size 452381 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..10b993f6c08 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:398b0931312385175fd41f1f2df0eb52499a1e74be193badea952390dac9170a +size 677124 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..359ac1d89e0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8327882bb53cf689404a8317c75062462a3eea488e11ca5c0e26fdfaa40a9476 +size 507453 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 45f2d91512d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c24cd1719f9be493dc4615c471bdb30fb8e4b5981d6f7153ec19dc203f70a558 -size 579835 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3d82224d6db..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2bd509152dd83a6b2823e5eef91968cdebc5225ec9cbe36e6c76d7c6d842aa5c -size 453135 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 523a9e63bc9..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:950e614aa5ee8cf6780c375fc2dd9ecad0b4bf7c325a7ccb5b81625b2c1fab31 -size 677878 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 84756d7f92b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1f7eb4ed5dd1f4fafe2af6629c0b9e0dc082ad7215265bd56dedfc8938abae53 -size 507417 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ed792c30bac --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:790cff7c149b3889b30d53dea55e334f4315f3e55faddf8e2333998a65f53c81 +size 554411 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..169767d17df --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db70ded6e971c5585fe4404fb61c5b01fb43baa27c4196f7062ae8350911f986 +size 420311 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7726c590439 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2e2eb7cbd15f02b77a6da7683308d53167a23cb4f62f03cecdf7e1baf11a3b2 +size 648112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d9c15a1fcc4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de378810da29b28322432d6f71bdef11fa70707607798fd6ae7dac495829e892 +size 475481 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index ec395f25691..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:15b506645d34b1116e92e5f1657c927f682668a3d9ccc95678e6da45c1083039 -size 555165 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index f94481e5647..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c7dcca8109082f9f5e6ea59d057e7d205ad2e4221da1fd5343e4af9b206dbe20 -size 421065 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4535d0b59eb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:46f9756d89b22c101de0c2f551d038442cb13909e5861b87bfcb1f86691e6729 -size 648076 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0e04e3191aa..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d230273f6e9075cf34fb75770005251b89c2d1325c803cd8c1e8bf1cef6f0aec -size 476235 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0a11971d7fb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69bae00205b6c18071edacd871c8f72ab75633457ee8d071cc712e2d55c19de5 +size 576713 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..49de3a6960f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65e2ceae0bca54037212a696b9f6572cdaeb37990362b92c5ef09eee22065610 +size 449915 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..dc5301ecf60 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:561b7f79bda7308296283e12a13130c4d919495d5f37fa0ee1539a39103e6f84 +size 673030 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..40ba3d31c59 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd18d5931e61c8872a846b0402e8595682a8678ea367c0a7b778bbcadf944d8f +size 505085 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index febf5026b0c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3fdfa9ee066e35d896e1c69bf694d1c84c16e8f4233e3286da5aa2a52bfe3077 -size 577467 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index b221622ff40..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ab2d67f394b69c8007dfa376cecb61ac94b88a4053f3d089ffa3c685a80e757f -size 450669 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index c9ccf021435..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:48eb0a0d641c9e61cfbb7956cf3bb1e8240ef6f79b7518586889cf5466539a3e -size 673782 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 101e8609381..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3dbd5b6c90312f4eb9b6bd8ff0ea71d9b35226d2762c2b59f2f2f77ccaccea14 -size 505839 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..91237333e29 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07a13f1545439bd9813d27b691f22f48fc304727bf4e74bc06466725e51cde57 +size 1190360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d223e1b0959 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9784a34d420a40abe1b250846599854cd746927c77595c6a59e5a664f4a8f029 +size 1162736 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..231cf0ebe55 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd5ca289edb227a1858f32c2d557d304fc210f30ad3a0d74d624184ae1aeadeb +size 1087994 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..4fcae5a364f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a77aa4212092efabe3bde525bf7d95005996a095df1944768d6fd81420a85f26 +size 1148622 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 06dcf46883f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:673bd1dc6c7916519025bd81eb92471db1372f3e5dbb10fc211e59c918e32732 -size 1176314 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 9f888bdf22a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d58709010617e9a1d0678ba03449fbf7e7efdfe33098b513f4c0b45052058405 -size 1153524 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 68bd93c360f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c8ef6e0276482d98a1df11c7f85c7feda618317f824f564d9e4b935e8a07a667 -size 1102462 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9414b7d8480 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ce0adbb0c9a83a1235564e3feef2f20c26a54801e4b64034fd661953efb33da +size 1207878 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..943bc6a9c78 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b64a4c1f6f5c0be92beca85153f6dde1d6178e0df08be1e209e49d393468c90d +size 1184002 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..069f9e56ff2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88979bb4e55ff30766c91b7f6ac1690497b828d340e464f22f6ee08c4a2ebd0c +size 1104426 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7a0024a6d2c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97f333e40aeca280217866d701627dfd506f041cbe8ab319d3ab211de25972df +size 1170974 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index b62d62c3516..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cad7715fb915148d3bbc0b637fcc86a4f747bd843e20b5bba73c7b655de6c303 -size 1190970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index f6894cdb632..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e294df2731992947e89dff643be6d5f577169d2f6598b407846c5a918968885a -size 1167884 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0b079804bcf..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:527ebecac102abfc03988cae0e1b04beecc33f1cf474b9f17607ca33d0a96863 -size 1119190 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..3d5f5d15bee --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8df0f2c0d2bffd0671b2d48dfb4afdfdd58ff730a2c473e0221b42fcabb0dfb2 +size 644780 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2ff49cc3d1a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0aee944b6313f8c6451cda9a5b5db8984254b04429a1a88f180c5290bfe6a6e +size 642610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..8d7584d3433 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bec67275df9123ab8150544225db4ffe10c4c6409ed95b51c6383a43aa3d07b +size 629732 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7562b3edf9a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dac4da1d0f9972f95ed5fb285057d575496ac8b89fc3a8e974713e77c2e08b02 +size 634466 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2cf6a3bb907 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7a9aa11d3fe0b2a08b1a5f27556592dea771974b413e363022fcbcb0b759e30 +size 520595 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d3fd570ebdc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efa81c9e726d63ce7f29cfb3b4ac6f96a27c87deedd3aad8ca938235baf43616 +size 518425 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5ad7c4b62c2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4691452a0ca6ac92425f63153ca6a7a75453b0ac63729e69667a7969c290b4d9 +size 506189 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5637d778c2f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da45fdca134b1f3d60136ce57a1a0d5ecc3cf2445f5cb19c4554a414ff19a0ca +size 510973 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 742607a7cc4..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d61b2122b9e7fd6d53d8b2be755812737ba1e30ba809d254dcb62ae5028327f0 -size 645532 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 059b97fd4de..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cb25c19a1e3ac7172dcf4283960d71b4a38822b5673b8f4485b4f2265ceb4e5c -size 643364 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 9d737f11302..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:254a3320276e2f4b2f7a705059135097539d2eb72acef28156da588d918e79bd -size 630486 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4f57cc4d106..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8165e035c95345f962ee47dc127a25fc323ea082c7bbe0f55d699a06ab7b6d98 -size 521347 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 92d07628e7c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a0dd19ffec88d19a18ce8e8969b12535e5dbabcb7d5396019a4412c48c18f166 -size 519179 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 465ada9f6bd..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a7ff006c5f60e99af7852a634060d5c7aba5129b26d522d70102e8097b71aab8 -size 506943 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..bb5576996f3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a841c2db0bca7b8f68d3edd2185fc0c79acdc6fc1fdc2e2617628ee9736d66c4 +size 665700 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..edc99dfaeef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7deb01e9efc95ed936c9f2bfe5a7b0193d686d90957c0370a52441cab687296c +size 663482 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5c3e18da46b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aaa4d92355546b56dfca36cf21fd9f23df076704f0f2a4eb924406d44b2c2431 +size 650604 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9c929eeabab --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c6c6a1ca14c1fcbeefe379d4013b4e0c07d86a5831ff5727a4d5bf94200965f +size 655388 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..8ee60b65c20 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:665d15ef84a2a41e0eb72474ca5ffb6f08ed5531650813c0500255ff7e805639 +size 541269 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..a77be530f68 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1f02ece725f416ee577e44c1cf9ef168fd3d191a6326810c723e21609fdcb2f +size 539101 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..dd2a5ceec1b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28c913fd7efe9b3a4683fc615d5a57c62141b6d82eb8ce28c7bb6e078d190d30 +size 526863 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..504ab267bfd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2da0e52edc69a73a3d632a195f6f2983a3c505cd7d669771739d163d5cb49d3 +size 531647 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index d6b1dcc5e0b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:769f9613bd4f437eda4d702f09729624d4749c89f08446a874f2b11bbdcdd2bd -size 666454 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8f8e1566e00..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:458f909379ee25fe8446f23a1be12a497f219abce8b52581f8e340f5c3421715 -size 664236 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index ffe743a1d2b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:34f7559c449a0e7581d839f5fda9fa46b062da02a4f2b9359583df01155b428a -size 651358 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0c09baf2147..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:05854454812c42b4da4ddc885b9cf44cf5c180d711105d8f5d492fa96da3e896 -size 542023 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index bf679bc0abf..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e2c0c329f291e0b986651f225aaef8fc086bec5ba7538e7f3e9c49cd27996626 -size 539853 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7abac02035b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7461ffa6ce5b8825d0732cfaf14fc9f38ee2ef722eb1588408168668dde20a12 -size 527617 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ad1d1744b95 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d1ff8a131670093b02b7f9f0a17c9dd01d6620c599a6743023b802e75d75a03 +size 696392 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e125584043f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:957cfbafabc33fa928df8bbb2aef7f39c1b70c583c1d6b28c08f9d0aa1a516e9 +size 677054 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b01b2be5c1b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb9257eba5e9fc191fd8762191dd9c37de3b0aedf175852dfa40675833c44f96 +size 680998 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..08509a91e13 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b39cbcc0683b8b71652f7e777cc6f0e9902323e860214cfc2d47415544c106f9 +size 696738 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..31880d21bf1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d2b38b5ef7c2b13ddd3cb9fd29534b185c6fa6ed35891f691b1c6ac2b435716 +size 673946 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b22dfb066cb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a9572ccb3c60fec2970073fb2ab5a6e2e28c038b074c8693c04eb6eaaa372e6 +size 677940 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0bcfdae7144..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1c7c7e1718aafa989a444924deee5cb447c9e83c989a695adea9e13154f0f6bf -size 696356 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5ea36ec717f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:03cf8b652148ada8d03cda73af2d36cfe3a9ae4d298efbdd0fd1edc2b1401284 -size 677018 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 42217e94265..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ecc7e69fb75ee362f4f51911b1f3a82e7fc77c9e1ae00613bc5c9e2da2988fa5 -size 697490 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index b898972ee6e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:58e33723aa94ce3aa6b78e01894a54cbd2eb808cb145356bbd64657a46ba7e83 -size 673910 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..208fe32e834 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a467315f28f41c935855857659176e7db4a5d629eb13cba111877f580657965c +size 633870 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c8e13200c73 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46858e8b3ffe01603c7fb285b4d3bb3a58f5efb7f6a3c374edc2639693fc582b +size 631652 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..329c2b7efec --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce37f01cadce9262358e5548283c6f2c02fe5ae438027f6b7f58b175a2ae195e +size 615517 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..996f59659f9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ef3d9079e5bd84f9fc5ed0dbc8bb87508bc9821621d84a5082b133fd72b7aaa +size 620252 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b19df27b3f0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72e14cea09333037820e6eed1aed87a64681038886b6de6e792c488f34707560 +size 502681 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..647d84beafb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:712ef43454c5ad08ec61762dd6a8cb6831bbd4eabb5b64ab348bbc0f65086138 +size 500413 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9c13154b207 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fb99ec84aac88eb988bc7c59161ae1255b26d5b9cf33a68ffd2eb9b23282fa0 +size 489755 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5c3e636cef3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03d25b141b6cb47c499087ac0ac7cc81f964e2e988010dc71c7ca776352e55df +size 493749 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4007a0a9033..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e4433e710f49e21fa0a8bfb2bada9bdd771c173a5fec4ef9f4430d06679e2cee -size 631516 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 97839bc4053..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3006390b5a6628af46dd81578c682094ada87cf48cc137f281fabb5f89da00fb -size 628508 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index ae02fa1546b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ae95e5490bd59e38bc941643f8292d19b9b5bdaf7ff945d645c231f5adf6581b -size 614445 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index cd965e891eb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:48c5ed940eb728c70f1cc03ff977884b0ccbee557cb11b0eed81f655f513ec90 -size 500621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 1b6eff02d4c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:743bcac21e80d2ad64766d0460ee1611db509b981da9855dbcaff323bd78c8f8 -size 499095 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3276938ca9e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:193614b16c678f9d0ccd2df74dcf71df019e8c2809f65255e84604aca62fdd3b -size 486907 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..8dd210d2be4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d90cb51337c69e4b0421e23b8fa6773df128d73b84e10547d39c6b3f7075d68 +size 657850 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..6b30d89aecb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a39b3d27b974c846c2a30fceb5735b5cd5b8a0d6aef0e2461e80103f566270c +size 656422 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1dd19929c8c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e77153a743b1b7bba4d64f4711fe3f56bce1a223e965e9305bfb80fd3ea8774 +size 640190 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2e57b721f49 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79bdbf9d59dbaf3efdac30ec55f0b579c9a592e42aaba1fe271471e7887fd42a +size 644924 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..287b6da6682 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:099fe436b7f1c36292954489a4d2ef78e6d3201c2f8a10d80cb5370e1699ea2e +size 528189 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..984bc7f6e17 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5bf4cc62e95529055e4f48d7519853082fc2cbc582c74b774d884748f8c6721 +size 525231 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c29fd9b0859 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4db06510d92f6c7cbe265bb78a7c639fdc8c81f4dcbde83dc611ca79238e91b +size 513833 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..da6a60a1240 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60fc58181588ec6d43a324de8b506e5726eccd5f1b695bdcc6786db69bc7ce23 +size 518567 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5ceeff81787..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e981fc4d351a52b1609a696bb879fc4ec18d143a59bf85095596b48800b5c4ad -size 656186 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index a675b719faa..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c894a2fa3468ab6eaf6fb15c5454d3a7339e5cc4102c5529839ee86ee29f031b -size 653180 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 443d4c8ad7c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:edba8658c79a0b34f24d6f1b2afc221a922ba9267e18dccbf076af512ef11a37 -size 637834 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 31bdc306345..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6084c5f8b5b411c0d0f49b6db03bdf23e36e6afb5d5627a4c9dee89b54cd88a7 -size 525341 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index f0da5c7d1b3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:10125f7fef72677c6594d49e52c218cab814393f94166864b42ed0e8369017cb -size 523173 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 77148681a1e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:af30a5f6c89c482f2596ba306dab9a671a8716c16dec5fa4d84c5a3eb5731632 -size 511725 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2a51744bc30 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:392c728d320e49edf35e6540d80335d686649133149fd81eb46fdb8701498e31 +size 836034 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b53f1f58d3a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b5e848ce8c62f481c36415961085623fc4f52af242351cb91e0cdfe9b47ef1b +size 812158 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..3a80a8f117a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0c39782516ef5a7146ab1c536f54c47da3f852ae0a168d92301834357740f11 +size 749354 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_silu_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_silu_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..be17b633fc6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_silu_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b76b832b518159f34103b40ad02cd45ab3b19f2d1fd7ef588be46a35a80c197 +size 786550 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index 742917c29a8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_geGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c4e2b0cca4589e387913d1ddca5e98478934b51215987266f2bb2a379c591c1e -size 833678 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index 6955c073336..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:32dc2f3907a4332c09dc08f5f260d14b41c956767a1ce6e12d18c5b5d7c252ce -size 810592 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index b9b4c7d0cef..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_ldgSf_rgTma_clmp_lbW8_lsfbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:430948b4639371d964740347d1f28e872a53f99b578e2becef15fb082bba7716 -size 747148 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e287597fc28 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:140db5250add529a825972053005c442934b0b330a59eb6ad15a3bddf47224bf +size 695642 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5e8d8c49237 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf047d821bd76a88c12b8296f4ba4a195b76a24c39072b485270d900537975d6 +size 690464 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..4e50d253be0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c43776405299f25349df5d6bb46f6862800e8f3a326df50f6e57c143ed99e8e1 +size 663132 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..165827965e4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32b8311ea5abc24c867fb20f77f412da62c40dcd351c28bbd9f2e6baedffcafc +size 672700 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..005adde081f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65eba59fd134b6dbd6f7fc7d41124c7aed1cb150849b5d541b54a273a2fd717d +size 573579 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..51c7d749692 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cc5ecfda7269ca110176b244f67261c1e1a05d0655125a72f38e6ee9f6a0989 +size 568401 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b6fa0d8614f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e7f47a7c1b1871e35aeada059e8018d8a69a11da1223f1dbad76e4af4a0f757 +size 551921 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c53a39e119c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c75b2911fd95ad6c81ce4ec8689acb539fb9d35c83faa15d2814776aa3f68c66 +size 560701 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 92cf0133d37..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:73cbe62a71d114f0ae133a96e53feac979469a5d0fee643a090ab5550300be1e -size 696396 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index ffe0ba73736..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:194f21acb1d42e715ab4d787f60f7e751c8f9dc2c400cbebe383c344eedc13ea -size 690428 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 16ac054ef71..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:81b5e88c6525aed5422b2450974efbd81185d3d5c8f271acbdd996ef275d6628 -size 663884 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3e40b6e338d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3f14801fd0ad6437868976b612b6bdd35547868f36f7699f4c01e1a1581df83d -size 574331 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index f5d277cc752..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a8fe8c999aa8d477b72d287833bacae3231eee5e318686ab9087507f9e9f0e95 -size 568365 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 75b4936440f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:263fad4505e9ac2375ed73d797f9bc90800536da66ecda326e343fb6c0031f01 -size 551885 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c202a017fc9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bea7965047f198914fc3339927f9cab3bdcb81e5ddfd8b1be4ff5b4d64f7a25 +size 714688 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..fb1750824e8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bd43c54b2bf2e7e997ae2eb6e95a33f27cb41729f38be42b19f976750a61cf8 +size 709510 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..72773aab41b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6257cd3edc83d6e9f0bf03966fd2573af6127e6019c730d8734113694e0a989 +size 683164 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..817427dad30 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3fc567afb6a9b5f283dbfeeffd9a3ed45d911fdca7b5914a1e32807c046af88 +size 691106 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f51b4c550e2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8365adf7f92b7c8468fa717e119bd6df97d1c4754442edb3cdcfe50fed6113e2 +size 594303 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1e8eaad26d7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:290e38c4e14902bb7edccf10b0b2ef6ee732b18a3a166fe30a8914f81b18a2f5 +size 589125 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..49209972364 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c6d88d3b6a1400b5e323b22b977e417b71b48f21c0951ce8903c82ecd4e07a4 +size 572595 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..04a80401bb9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:538570baba64dce78b9f5c96fd2595dfba5719b5b8a1e7a630a60efcbdf43ec1 +size 581375 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 9f3c8cad18b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9c64c8090cfcb0c52b81f010fc16c9d69cd22fb41ddd65aa43a144453ef0f191 -size 715442 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index f42215ba6c9..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3051d5c1456d00deb9017d54f3ac6a9afdc4ff991ccfdca943113b954dd9a60e -size 709474 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 738e2d96380..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d80e0a18c7f368348b42b088791e728f0e57fe389371b9c2793e7ea9939554f8 -size 683918 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 2d8c9ea9bb8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d7d471a5bf098637e7c46af675f0dc8d6a1239efc55ad122edf631425734f5f0 -size 595055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index e362c05fe60..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e7148d74dee3b57479b2b405b4f8e7b1f439058407fede8e3c63c8d87eae0605 -size 589089 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 475505ef012..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8f7b7077ddc35a6f3058362d882d85467833bbf6d35c09cace7429f9e56f04bc -size 572559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..459fe4bc32a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd5d30018ec2c0c5032bbdee632cfecb81f8a66b65dfbeb3d24408199b61fa58 +size 744788 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9a185620a52 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dfec1996f8229512678bfb8db6ebb9bd90689f10fe6bf11082094b1abc4894a +size 706900 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0b7bc703fd2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c998a9bb868b9d452724c4c3c1341eaeed30ffb1cfb6bf6bcb48b259820413da +size 714840 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 08e1dfdffec..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b3bdab1f0fd7637b567cf679112a3e4a4861204046d195264ba8464717f32f69 -size 745542 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8b36a525717..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:581f662df10ce856d69a4982ef9bc01b66d2ef20909b1469db59d1777e40b194 -size 706864 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..3ec2f7f2881 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc347a99a67fb94bab28f89f36dcd36aeb000a7b7966b69fd4726707a16cdb99 +size 665346 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f814ccc8a28 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81bc88c0dded79bff5eb254b6f39eeb2206ae051b19f97d4d5a08355d6e321c9 +size 659378 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..68faf0d0c19 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bc460505528c69f23a5777a07789edecb7e71d002338d852250a60ddde68291 +size 639396 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..aa46a91b7b9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f0e0b37a1d08220adb81f166cf918f805701f8e06ddcfd9352b424ed16f3119 +size 648028 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f46601f2941 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c56911fb6fcb3260240fbfed2577a1f6a360fb77bae1e3b399c38b6780894a9 +size 528137 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7d0d94b1692 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1deaddeaf2108717560cc2c6b2a5a9a81969d7f44a3ab92ed7a7dbd88ab82f3a +size 522959 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7169a071d2d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30044b597298a7681a20d9f17bd41683854e7fa7249dde82ff69086ac8131849 +size 507367 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..102947360c0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61b89daf624bf330f5070b82316c5527a3142c7c75f94f864f05b5f3bdeb0fdb +size 516887 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index be46960fb8f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7e525610d6a8c5c88ae0ca5eaab74e554a09121167a0dfc3b00ce4b3690af8ef -size 663582 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index d7b009faa41..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e896494ae5f19ff0196b3e8153a288325b1b7423d0eb6bbe625b5a46a10a23a7 -size 657616 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 687f9199804..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:09c9f972f80322e3a630dfe35f08b8b9027ea6b8a98f67fd87e7dc201207f389 -size 636844 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 019224e3b09..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ea940f1c4fc0c9ee76dac8e9c4692e38fd035d49abf2fabc2184dca09c1376c8 -size 526077 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 26ab794fe3f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:06ba4338a5ba2cf2c1ede4eeb82fa1f0caf6607716e587f0cb6a10958e0aed40 -size 520111 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5eccbe8d06e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9f95ec711a387191fb623fe9d6608586092596cb86ab09f51f2f9ead9c9275fe -size 506147 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5f6c3cea2f6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6361a3fdf02901d67df2b62a95635e028f69424fcea27cd8a01bb1f87c784af1 +size 694802 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..6214a687bb1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:714dfc76776e4a8489cfaaebd11aafa58b3ffc41a0f6adad35de0b8f7ff1ffab +size 688834 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..be4b083e40f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee1ae4fabc99f0d415e0fa65997086252e5a232fa605180c3cd4cddb9b2c848 +size 664362 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..6cc31a70f0a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0266660380a1e4b1e76fa70a0542d1cdd2d769abbe32b8990847b4bd0a2bc7ad +size 673832 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..8bf09947878 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4164a90209efa8f19e9d71febd9be60f03d6b9f7355e934b29d35a6af0a39bf +size 551969 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..22caf6d2055 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e82b0c0bb7055786c373e2e7fb17c7627977c3d7dd36f0004fa49228b9a12d85 +size 547579 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f62154c141c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36e6245e0cb5959ba5f44823f78376c1fd7e657d71ad3712b171823505d29c15 +size 530311 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7e5ba2108e8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b78390178a5846dde6787326e05ca5d3f4f319a8d735600baf12bfe344e151a9 +size 539879 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 292cc9aa737..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:142baaf35b5c21612ff27e9d0fc7bc9dad954583de9a0c8da4b57823c7381b2c -size 686774 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index c672acbde86..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2a22f778662c1540b55e63b58b3d8a956c944db9845e184ee19bdfe28297ee90 -size 681596 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0b3fd6f0803..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:50a277f325c07c2aa9ce2d5409f88911f5ec70cfe24706ca8e97ea6d06f6a3b4 -size 661810 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 88b06d25e9e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a14f4e7ff483cfe171b3144301ba88978526fdf57e1c497ec5e623e09fd41dc2 -size 550749 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index eb29decb155..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:90b45482178d00075c8e6f3fabc05db5e29a7c5a3d8f5cbfd2aab285d58a0be8 -size 544781 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 6158adc2799..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8de2714d16628b0e55ed6ec52ac496009cfb16ee5686e3032114c7a82a9bad1f -size 528253 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f12312ee2ed --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:403343e348cb3da8468c610086d0adfb37b6acac3173c3e88ab1e65ee43f26d2 +size 909650 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..a8d7f31dd78 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a3fa89007aea5f6c73c874f770263d685f458a54cc5c975baf4e57d1b2ae28f +size 898502 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..86a3d270555 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22b6fc4e09b55cbfd7f22ab1d56b8e1e9486e6966f7abc4458f99567d9a6b1f2 +size 867816 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..4f1a0c85f68 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ff3eda51758dc0a08bfeb128a1e2f86c4f24e6e10a589d069430fdc3cf2c33e +size 888534 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8af7d9d80eb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fe9f3abb72623ea521bfc375a286a2111ef950b0c42ef47878ac393d1bdb335b -size 907888 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8f89ec3e07d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4feb4ea798816872334b799d8eb6b139334c177ec27f73f1f53e3dcd03ffccd2 -size 895950 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7d2bbca3243..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec505f6db2cb5a7e70c1de79f25529f2bd3a89fff43b545ebc869d61deebd153 -size 868026 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1b83453a5e6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40d9963bd28b7f99a51b8f4f8055c6f311ed89492de0edefbe3ddd0b2e730699 +size 930276 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..6fe0dc7e667 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ced7a8b43c45e6c74dd891b5cdc7d252ebcccafa25cc9cfdc9c111b6420ba55 +size 919128 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c8750681192 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d271a85f675b083b517c6f9f0992156f34213a4c13ecf4f188ecfd5cae40927 +size 888194 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5f1b850f248 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e743d381bb90a374228cb3027a144765fb33b9ce18f8d06a1b8e974d6f3f52b7 +size 908912 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 9fda233aac8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9941d66c2f3e3036c0e8b2e86a967de11693959e6b74f9d402b4682ff988a9b4 -size 933200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index e12a7f582c2..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3b8a6e72ffd7a3644f1aa1383c58c794d8bbca82900e21d19c640eb546f257b1 -size 922842 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 6c314868873..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2c190654bdee9b8dbef2e2d7f2593e1f5d25d1e01b67c32a096f86f967d139ed -size 886628 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..95312facb75 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:966ed2b70fc52627c84bd546e88855fb0a7e387af0b97f6a16813ddc956e2b3e +size 733770 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..91ee7619cd8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ba10494df124341825d6294c19a47cd86c78cb5a28797122b6a1f944977c37a +size 722574 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..a1b7167ca1a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffb4999d96fbae9252e52ab4a7d0e2f587c7d6f8f0205d50987353c6b18f28ed +size 691936 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..88ce207ef43 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1fdfe2d46ce0c30ae0ee23b3a49fe776184ea93d53cc2b15c8aa42ae9136eb0 +size 711816 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index a8c517b1ac8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8450a1e7978618d4f58608bd01c3bc0e10583a5ad991163cc875b86a78509391 -size 732008 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 15569a5d974..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d6c5b5c0a0ec3130d76cb7f6cd16edf5b8ad8869381f7ce24c2b8cd58502012b -size 720860 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4183163148a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9eab3f726585a68190e514dd6c1e887ea7fbdd7ae9c72432783d0660e3405edb -size 689334 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b9768dffc40 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64655bdafd099f14f498806859644ed2e0c43604b65741bcd412cf4b7ec61734 +size 759082 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..06c13407f86 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5b61fd15065a4b97c50c4adf833c6963df5af2f0bf4a520f77bdc494eb838cf +size 747836 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..daa5b313a8d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d764136daf299f593efc5367b829fb2fd22bc229a1d2594e637f9bd1186e7b30 +size 714436 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e9204903e3c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4fe866a2a8e0d58d9f87c33403f3dc6903a80c43ac14c634494efd5f3d34003 +size 734316 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3ea6dfab238..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:36beaf972a9963534060b7870692338673deba5a0d0e2554572b57e83ec2272d -size 756630 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index c92e338b9ae..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7edd35fa28f506bc3766833fb73e91ebcca07a74669f093d5220469a71450656 -size 744692 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index d9bc62a2a3f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:80d05f404a682b05f0f168aec3ac9084095139a2b583662dfd8c77815363f12c -size 715140 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ae0b1948c6f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5582643cc85895db7cbd38fe91dbff7ff2f103438e51b5146df046f6a2c7376a +size 609993 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..24381665239 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1660a9389919915e22f6dd4bb19038eab0917419b2ea402069cecbde4ea9ba23 +size 608515 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..49d10c743b9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a087226bdfeb30ebcf59f96c6366d3490d8796fcb86df3c5d4308c2162a94f06 +size 599237 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b1e3975287f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd44b9c76026db519dcc34d6450c67751fdeac3649856bd526d31f9c7f91c8fe +size 601653 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..eda378ebbba --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e98aa2b3fb0332a307e561e07677f4169a613d00b03701f59f957050b9beb5c +size 497205 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9b5da33d988 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9c19586c7e67e85ec237efeca9c00f61dca049ac86f1132d62ddac672707e50 +size 495727 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..a23e1867ea0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad6160e95cdfd591d637dad1e2b0d101ab974212d3893e033264680d261f55ff +size 488621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..50fc069cd87 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be83e239e51fe043eef45d2a8026c3202026f752dc2149dffbc3b97f97a07084 +size 490197 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index d95c87a8e67..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bfaa46a281c2b41b77c3ffaa2ae48a5a98d3bb488c2f0a9fedea2126a81df2b1 -size 609957 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 886094fe8af..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d342706d58e4a92c3f8dd3dc4dddcaf0cf513a118752f8c3010ede508fc88a39 -size 609267 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index b4ad82bdbd3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c71296b533d7f4b8dfac80e3a37047bde7f1d34b521a380213b1d91daaef1f85 -size 599991 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index ec96dc00694..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:235e77993c8e07f916bbe048c774354d1c84c362407f53fc40d8c8b4e735bd31 -size 497957 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 9a6bcd37c12..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:52ab43a655880a550383ed77335a9ce2b73b272b5f81d58faaa4abfd759fe7fc -size 496479 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index e470cf8052e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:25f793d56d7bc9a8ee9d6a29f2aa70361d1fb62123f02b4e615a8f5be48af32d -size 488585 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ead86b21b58 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e96a47512fb195db2b2e19e2072ddd43a5d526e1418160522c98469aeca149f3 +size 631014 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..70417ea41cf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b778e55224d7eec04ab361a0543cbf1c6b4dd786cab2bdaeaf408956d9e7916 +size 629536 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..8300a2a965c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3cd9ea8d40c2c780f3fb26ec2a30ab8a07bd489cf43790bf892a8976697b2d8 +size 619272 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..27d3d58a884 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b23ae4afd3ef514286076c9b0da20dbfcc08f3f1b2acbd92d1d9e3ff56e2895 +size 621638 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b3fe7672cbd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c7ed60ca6364c97899b87fac85fcbcf2aeedb132e4132c9812c8e30db85701a +size 516645 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b52df3b9b13 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6342dfd0aff099679577bfcd775c0cb73fc8350f90e3119ef05e7310621bd26a +size 515167 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2cda36febb3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0396f679bb56721fab8f3d64c4fe02a9decc52b485dc3b4c29f3a875c7bef6b7 +size 510035 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..db5fd5e88ac --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c518c169f2c384cd04b6cb1580514fb7fd0ee1d8b6ced828f79d57ed38d8d503 +size 509687 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4e800e2abb0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eaedd887a20e812b6036230294337e192bd71cde16f08e9bb91a9e6460124d0e -size 630978 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 9dcf6428adf..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6b2d9e57a8ce5d426fe384b3553329e6742b2926d77a4d2ea15b74c640387fdc -size 630288 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 566e4cc0e81..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2c4b699c0f49f96aef4b4a4a3074fbc1e945d6a787b881a1465612458335877e -size 620026 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index da4cd811e02..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:20023944892f5a439dd391b1cfdf092f8f477dcda63428181b30da340dd578eb -size 517399 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 434c3d521eb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ccac55a6aa9b13cc579b56b59c5964a1a390fcf15477b7cc4bfd0e13ff1e2968 -size 515921 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 10ee319bfe3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_ldgstsSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:819ebe2f91611c878073ed3a08fb1bbb21e1b1b50f2f6e3249a2e594d07067f3 -size 509999 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..dc5c2f1d0b8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12eb6a2da853282837bb1481779d65c55ca30908ac0c5949a2b03de643377a8c +size 424685 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 9778b80bde8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:92df9a0068c0f3698222aae2646e3ea179d9d721dc13b7dd399dee9e8cbc5b8a -size 425439 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..713faaa4fe5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62a57a1bc97714db270e4e394ad6f108cc5b14feaaf2207e4367a6273245c895 +size 578955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e87c86fc49c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9879f25cccac7cae69b1db303d572277672bd97c13d58996e4bcfc80e274b94a +size 577477 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ddb8bc04200 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb250796c250510726838d258cb94953ce6b80d92caa79a665600a624b9b0a91 +size 568991 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..3550865782b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e88540fd3584b4dc585704c307fbbeaa48ace7f0def98524fb92bd613b3f9f4d +size 571405 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9e1379ed38f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:391b8b0e99e15b625fc281f52fd307a3f5a4300b7e9662ef54f5708a018af92e +size 448851 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..4b75f3bc0c1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eea3fbb7139851e184f32dac924227f67dd31d678d90b92aa0bfee9945d1d8b7 +size 447325 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f00c4c580a8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ada447d4dda0dccdf7aa4bb5942f8c11a9017e7cf34967e2e6eb9e4333ce92ee +size 441255 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..dc364ab826a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd1e748b990f7f595b1ca13136391a28ce31fe6f800f12ea34e1c8284204daeb +size 441845 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..30b1f9bb018 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf3e0a2b7126c059695da62513d2697299ead2f32d386d4a104c290f463074bc +size 676850 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2b8e1092cbb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41bf0c8b7517483a6eafcdd5bae2aad59245bcab56c84befec49d253bfd79ea5 +size 659682 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1953428dcd4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a48ee11195b1096233f2c4d2fd4abf75445466d59186f02a6acb19237a5f4423 +size 661258 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..73830167e68 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9753763545f81f8a45f4366a9da2febbd490256a5105fb348e0f431f3708e0c8 +size 678132 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0a39766a5f2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67316be36394607c70ed486a8aa8f394ca010d4b83bec29f20bde2f47d63af02 +size 662100 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..00434d9bcdd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b48f4fb57876b6983eb43d62e58d578800cff7a40510e85ae6fc78f1cdf59c7 +size 664466 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5b29e6c0525 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73b123aa5d19d776a37e3701cc6c401a45513f76f6ffa856615a44b4f26e1274 +size 679316 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0d545d78998 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa8f9e429fd845cfec98e837cb934232705c9ce430b171aa2cc5f076c67c02a4 +size 660472 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..6e3820c96cf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_c1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea0c33c4013c210e07cd1a2e539b4bd35ee88e3cdaacc58423cc7e5c562365f8 +size 662838 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index f7acd84b41f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7838faef1f255b6a2f123f0a7287e6321c60ab41fed5c9e77386396daeeed96b -size 579709 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 85ebd9ecffc..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7fd48bd5f64b2f58fe54738008ced544f4869d30dc98461bb258acaa9e90222c -size 578231 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 080feec37e8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:28bd82ffd0c65ade02d382c42b44e5e3f1e626d1925f84ed87a90c68a010d8ae -size 569743 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 2b949a1caec..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:99c9b47754aa8878a1721e486938e7a2027e687af85df79a6a27a5c0a3b74c7a -size 448815 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index dc7ffcf5dca..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9a6e473d70dbc95787bb3b0c2d0518f7504fac657e1dfe4f535f3706bcfe3966 -size 448077 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index cf77ec972b7..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e5d30b5b870c5acc78f3493c83731d4ded530d6cafce15922b7a367b29c6b545 -size 441219 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 10c42956768..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fc9720b4d3914fbab942de0ff68112a6ba3a86acc55e4159013aa071e5addacd -size 677604 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8ddb0d0dadf..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:68cef5ed24bc3a8e75a931a7a2a9d679a5a7d5d1ee1347426d739a057b691c8e -size 659646 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 540bcbdb377..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7bc309379f1e4d4afab223427fae7cb3c85c5a5b6ce82207b5f18de6751c1d60 -size 678886 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index e4a813f2eb2..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2a97325fa3899d8750937927c390054ba088904ef2fe567f411b79cfd301b9e2 -size 662852 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index d99325c09db..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:aaace32ba6c8e9b03cb4318d190ea103daaca04e9ce925f49c15233b1adfb295 -size 680070 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index e6c5e9be8c8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:39538a40066332d8fb18e1ca7351941b2ef3689b8fe0def4b1e51c6e31c6873f -size 661224 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..2824494ca2f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88a1dac357781079a31e733ac4a4a11c44a0fe5e0a363a8baacb68ff50844e9d +size 448617 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 3a9c4fc9498..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:501f6feaf4931387fd80c1eeda45437c3df938c22724881fefaac32704dfe41f -size 449369 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1b92fc10616 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c48be1d48728edb50ddab14f13768aa7aa18c5bad25409f6dd452dc23369dd68 +size 602147 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d71288d13e2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c3a2ea19aee2ab0435a90d245922a2d5695511e9586cfc8ba0b46d0877004e4 +size 600669 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b136042d556 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:079d6721a7ea482cb0f334f17fd3da3afaebe72b33dad49f8cb29c0818477a84 +size 592379 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..24afd54e3af --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6a53867f8a4601061810e3aa7c4e9c4708b11198112596e27e31e36224cb2ad +size 594793 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..209cb8e6daa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32ce25cb9216cba59331e0ad867b94e0a79d965b3d5117b92c534c332af174a0 +size 474263 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..317faf50f82 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:978239b87fee5aaae0afba6d125c078a2ad8686fcf409a57142c27624d429957 +size 473573 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c57f631264c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:531b8f7f765146bd99a7c478aeb22ac9cf8ece9562c0911cca5d470341efaae0 +size 465679 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e2839617923 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79ca8c231c1c507854c752b69c6149f874b4c56cc70ed418e4dc9019439b5fe1 +size 468093 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index b8c64af75b0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:34faa9afdae6e96b5cc6790c173da975bc7c96ef8e69febe0da8f6143075a678 -size 602899 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 9c56649e86b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4ca080f32e23af5e590f62f820aab2c6983c8d59a53dac42027109c9d4a3fcc7 -size 601421 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 15960d1eb82..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:915da9064326785b02f74b3dc8aac8b2e17dfa96ce98a44057f8e630c241c137 -size 592343 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 584ec63e731..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_geGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bfd2413aa4057e63e02af9570a9e5355480b42961174733475c4389e5087def8 -size 475015 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index f27ad300d18..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:06c488c5d42f3f22665bca689863827a23ab59582ab0a704167672d2b5ecffb5 -size 473537 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index abfc0e7412b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7a975e41b2afa9d778e9cce6e87b948a0c2b5b6cf9d472a7359913cf8256020e -size 466431 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..d9ddfe8da3a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12355fb82ff14a8c11899c6471def92c9272177fd3c1b40c311dbacc0099a608 +size 567577 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..828ae589558 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f24e3da092104a5543e0cab974d146988d533ecf2e5c2a530f681bfa3e22bfe9 +size 570637 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..9550a7c0fb1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bed009a68d35b3d0b2d29105be18fc507075e6b222902c893bbb74583d241112 +size 457059 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..2a3e372fd09 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c147da1e103a7bf4a35f9f0bc959ae7ec1f273e355921e090ccef6e8e3c08bf0 +size 459781 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 56f9d651447..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e1298e19b2d7ae38ded76d4987e952545db272a94b67295815802bae64ee15f1 -size 568331 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index 5bdfb3129fe..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1caea40f926e0da9b88160484a0a991f941070f994f2817ffaed811fdef12ac7 -size 570601 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index e8f5b6d94aa..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:792c3589ae7b14ecb68ae3ec68d1e5e47ab205f4254ea110f1cba0fc16f07956 -size 457813 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index dfee57d1737..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a1b9d8376604d5f32734124ce821044843c33e2d02b7017af05b935e406a9dec -size 460575 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..4513de107ab --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02a811aa2f6618ebc4b3441f2cfb5f01bb9a23799885d7a5b547d054831cd48f +size 581849 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..0133a15be30 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a964581cf4162c55fa3af0070cfe6e44d044ad9b9458657986b85717d7efe999 +size 584117 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..50089392996 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:925b4f6e4195875b6923507f25c8781d93d984d817fbde1dab3856777a1ce7a5 +size 473105 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..74467ebb475 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46b3d9cb5c917ab0be41901e7be9bc79690dbd73e19b187474ece2b8716b66c8 +size 475079 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 572c349aa26..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a8c6185a05deb42590943c817fc4d11c152362b1208bf34c309601b8e55ca1a0 -size 581853 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index acdae51fc35..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f361d4c389cfadf39e4f493fea812da19be067a352a7f5e65e7d84d9825d4348 -size 584911 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index dbf23023595..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9717ee135b5a24173aa669f742397e65c53997668777abae681019b217e1d8be -size 473109 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index 3eb3017d06a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fcf9d047f91bd96f8af4f61d003ca0b9353a5816c2fe9d465b6003ded6ad341c -size 475873 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..567a2cc6ff4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ec1946905b8faf0c652fdf4a49cf6d0a9e0b2cca38004638f60e10512714a8b +size 588603 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..117915ccfb7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2cbd8149ed311ee8a631639625451f3fc73f57ab1dc4e0dfe2c40be8e7f673d +size 592451 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..4aaef9f1b2b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cfd55ab28a7f6f7af4e8e131daa296aea23600d5bd8369bb6a59e5d4592bf36 +size 478281 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..a5daf770c8e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a487debcc0928faf16fa6d2e6de81d01d3336494bb529b3d0d73518d4468d731 +size 481045 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 4a57b38ab52..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:917831ae10b8940efe8ff630161fd0312422c40f6a1915b1be919edf92cdffdb -size 589397 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index 69ef988b3d8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:355b6734101db63d10c114f4aa84e10da2834aa8822846ad023f51596def5051 -size 592455 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index b307084e1b1..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:150f668601cd3c01d86bebb929ebd8497b898ddc81b39fdd4abaf88359502498 -size 479075 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index f16813cad05..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eed348993fa798cdf4beafb677ee78a884ecfe0c905debd14fdb9d73cb91d4e4 -size 481837 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..525b9a0205a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61102a1bc4d58b78221a15255c0f9f7eb438407561f8ecc656f66644a098b884 +size 603703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..8d7ccc41b13 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b31975a5be26e15f2aae776988ca02e883bdbec072cb9aa3a044fcce25c7fa53 +size 605973 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..e0c01f3d7c9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44f3c86fda981f1d8ede091d91fed98b3e12c0fe7c26c27c8a9999303acf274f +size 494221 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..76de2bf68ae --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:143eab9e670bc250d21d721474a53f982b67d6d3fdcfb0611c8f8e5316306094 +size 496193 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 34e1aa81dcb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2298bf6cf5788cf3300b66bb9156446967cf047016bc0174b87b94c2996e369b -size 603707 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index 6f68831ffbf..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:75e8f83c3afd2edc0e533db435858b57a7793ab0b2532d95d2217ec02c5f81bd -size 606765 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index bbabd83d8f8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b36b79b5c954988c438a24251d2baf74f696caebdbaa216631e61e7841af6584 -size 494225 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index 36ce8d1b163..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7d29000078aa1c08f4f1d4f4e5ecefdb22f2b0051f5e0894c47bbdd6cd58e9cd -size 496987 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..daeaf787377 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b73e8167223c3a1f399cd848fdcbd103db97da2c8e409e5a92be886a3667f8f +size 650370 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..49dede27749 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea98a1238dc6721aad34181e37fc0e4ac0de596a81c7035214434a9818756951 +size 653428 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..47a8d426a44 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc88494dc9462b7e3193513793d3b8a2f79083c135d787200037fcf1d9f6d934 +size 515627 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..308b33e026a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8f1f3d3d4f9557fad957e3b838aaf48042ce790c4aa34e61726472d15c24304 +size 518389 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 4fa5ddc429a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:05b6467613af8744f8629121df2f05895617b7ecdeb11b4cb95b22a8fa9509e2 -size 651162 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index 8e369235a3b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5c15f7197749a025b6afef0844b4b7bbb2e3ee88edc488bbc81d5763c3d7bf96 -size 653432 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 29147513c16..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5f2466c83a355fbb5ce491a385b150aaadb48d3c5c5f5e8f77df0155e929eadf -size 516421 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index c46b383d727..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d2f8287828060a7e1dc7f1464b5675b377641e206a40581b1b20fcf15be1505f -size 519183 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..f03671886c5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f91c0f0aa0cbb7ca02c07ee1ed49366e946e2ffac69332586c787da1fdcb214a +size 664878 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..d9cf6b397ba --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5a2a4cc6e2e0bace16b003fdd36ad1bcb2c2ab6322677ed0100fa8f2ea342cc +size 667146 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..7fbb64f03f9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13570fbd8018fc013d9d8843828122135aa558cd50f0299fcc518ca25a25c22f +size 531565 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..6bf77502001 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d50b1428dd5e776fccb42032579ad49aad86ded322159bcdaa058a3769449a40 +size 533539 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 6d80b4ae538..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3ff862760a0b4e4f8425494aba7efc3433f37b1cdba4cc39a173fb7cd023c551 -size 664882 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index 35ea2d73fda..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d696e0e132dd4d51567a92dfd39262c326f897c5f78f00cc779fa83f8c6b601b -size 667940 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 41c55a0b72d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3acc3707c1f1e85cbbfd645e685f5baad4ccbeef3fa2f248136da1d0b5810fa5 -size 531569 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index fabaef6785c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bc36ef026d9b00358ca2619ec055dc6241fffb77fb531b8f845ae1b29e7c4bf5 -size 534333 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..c12dcda7ba7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b71e7707ae745429c5a60ee74ad414d23ffdfe6735900447686c831a564a5b0 +size 556135 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..cf1195fa9fb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44bb3339d733e280c946a4cce6eb20e517e3931804c2274942824afbbc25c2b3 +size 559983 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..31ecfd4ea34 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8488b48078b781606df24e5babc3891dc9caaac2cf4bc377ea76832369ce1073 +size 446999 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..4420bc3a4af --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb3c74e8924c77e6efac7efd9ef2c38add3d748eba83d4f1d5c1de5dd374c34e +size 448971 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 4fd182ce1f5..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7912dd5919f4021cf0cae6c6f4208edd74afc1198c841f01f7c00ca5e831dbb3 -size 556929 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index e792858106a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:836e4538fd764fb4e25b3a999bb9305ca13e43a13a6b532daccff76dd40b53b7 -size 559987 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index e0d41bac20b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:92ad854ff1f54749e03b6b7c13b34c43a689292743cd266c25853a9dcae67167 -size 447003 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index a4bfd69c712..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ecdeec591f730323d2dc4f70523a1d89f7d0fcd070aa4137871d5f091c7c4aea -size 449765 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..e333ea009e4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba9e386f1e77b2851d3c8778d915f0598977890c245daafa3032d25be28d4e18 +size 570447 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..c6a8dbde7d5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5846683c5543496a3dac2a92902f7d323bfa0591068dd8758c2ffcac3be675a5 +size 573505 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..7e8a85d5c93 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c989fc4f6b47438ad8694f2719b66c7f0526de163383d9bb54deb01dda123adb +size 461507 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..84066c63a8b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cafa299936a6be1ad4f54602c75cdfd0c240967d68971e93be1c800eb69d5270 +size 463479 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 6fa6e1412e1..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cac91fe71eaa83240d937a395961e7956014b7247e4cd5628b03b6e0774c862b -size 570451 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index 8a09ec96278..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:42c1582fbc190e71365766dae8403b74d2f4523fbb44baecd3b3c44a127b731d -size 574299 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 10a5b3270bd..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3ee0da5cf1590872ef34a34fbbcaec8f38f321d289c5b6c85f927953895254e7 -size 461511 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index 091ca70d093..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ff65167d3f8abc86a5b2ab0d73a3b4b955ba49f18e79371e29a4e9a67253e5e3 -size 464273 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..cbb62b09b49 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9aaedae00df3cb56cfda21da58bebd5562631c3810a32ae265e449cca434b5bd +size 642668 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..02e27964123 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67b719c384fff4aa3d73ab695ddb9d40b8b2da7ef528f525052720c0af800395 +size 645726 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..869b3ae7fa4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f65be076e45227abd8385404ceddee2fe8c36be7efb7d898d6a7025a7bce822 +size 523465 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..ce00cfb8deb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c533f1f4609292c7a43aaacbc708a5789cc98f922faeb4e6ebf637eb753cdd8 +size 525439 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index aa5c2111d7a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:48eb3b24927bc079e4ba0174347faaa23aaa82240f7b758b431bd155e434de86 -size 643460 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index ea24d7658f4..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9d6e806cbb73b0df8f8f45d8afeda241ee69526c0763e81bb1123b2e5d45605e -size 645730 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 51e42e5891f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fe804e30edb266d40a1decea025c615dec51548f1320a7a96554daf89fed215f -size 523469 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index 6fdb99b1db4..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:89e5d57371d1daf5a1d1d75e49dc5609efb8847b179698a9a4acaebc828a3f2b -size 526231 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..a27719a13b4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d2dc543f2b3610ce7e520e070804f124c638bfd260bf466f944e74cb5499f43 +size 660974 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..ebd1f5c51df --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72133ee71c3877f12ef2a6929f1ccf49310a6dec2f955f645c48c9003b957eb0 +size 664822 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp new file mode 100644 index 00000000000..97396b6ef2f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb3217556e699f5156c78d69cfe2c3b21db6dbeadfd95ca0713db1ca229bdbe3 +size 544287 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp new file mode 100644 index 00000000000..59b75f6bcfe --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99dac529bbd0f4a2f8bd1c3eabcd314ea27ddca85877ad42c2e14efea03fa6a5 +size 546261 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index addaefa0306..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:64ed119823d8b6a8620d2071316ecb3a7e3efbbaad43516e8046ef5526a7e752 -size 661768 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index ea589bd5b69..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:01e235997a644082116bd5f8e63bd4477a766fb2af4340363cc3de551b02682d -size 665616 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp deleted file mode 100644 index 4bb65740eda..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7848ad1d3c72a0a2f0b25450cf6d28a9012a319a4b03fb593dc22a3f06679334 -size 544291 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp deleted file mode 100644 index 3388a02492b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_bA32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm103a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:00aa1668874956d0b4e2a47c2f3c7507f6b99c5d569cea0e3ad2d593e51a58ae -size 547055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c96e54a7364 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7a462f34fc3f18db05afa19eb08ba2ba0cadf782fc1771d1bcd1b7fc2d2b021 +size 892620 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c95d81ac288 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:380263c53324599bb254aaf9da9e12fe4287d279c3b0c287c711136d31b3ee88 +size 744360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index fdfb8e2e47b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:526684f41cadffeb7d83218452289ef418f030ff0b241f2421a8a6212ebec05e -size 893314 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8ba3ba641a7..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:32b8e2fc93930700c6436409670287d6467174c8e4c368dc65378849a9752024 -size 744020 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..084442412c0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d1d1f5648ca6e5ce9b50f118b70de72b00bf7a3c375f8584b9adc09d38dcc29 +size 580389 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..86cf07787ec --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6874a4e394d83373780a20831f7f362c069d03ea402a4e564d64e8cd51e00b48 +size 531055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..a9ed8983f21 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b174bd3a83475b5c73b017bf3e7553cb055854f274a2ee5e0c402854b59e197 +size 581865 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index 654deb6c0a3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c7cbc425ee845d54a728b4057ccad43fe4f8633f1b2411b6fec5281ebd700b0c -size 585229 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7b882e1140d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:70a849eef32e6f2b2ea4ee5835b19f21e320f368dee7b19a56a2e48852177c4f -size 533229 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0e951dd3144 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b55a88dbcf8de0a8133b66663b3f75d30b43dec9fac95d9f80df015f370d324 +size 916008 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..eff4284ef25 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cafaa8d209baa3c3403bf265df5b2472e17474a6634905ea4b294ac89ccabc65 +size 761434 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index f52ba50e2b8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6b9a9a21b261e197e585ff5bf9df1a73ed63cc1238796ccf25e063141145462a -size 915912 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index 734a9a2e55f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3d0d73673f3a2fc7f0bca1b9ded5bab8e00f7a81f314fd8f908a03eca8a85ca2 -size 761092 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ca013e2d4ae --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19ad5c5646b774a28da3276b25942feb8168259a7303f25ac458a91437f517b9 +size 591789 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..22eb29491c0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:422fdb785b819bd9315c1a6bc93ecb22c90e12f478bf98029bcd67ce0c84280f +size 542405 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9d0c5df425c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b0729f3a51c215ce4a3890e960e7bfbab7a05eae0e562b9f43089e266ae6e4a +size 594301 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index fb931e1755b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:051ca20fd6f3c053cb97a2366b6d5fb60fcd8a37e06a1c4c09aa2ffe0fca48eb -size 591005 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index e3abbb41792..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:10f2bf91d2d56593a5069cf98715f68ea03217627cf6c99f1f95d3a180cea9b4 -size 543247 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c5e970b4d7e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d8e4b1ccb75ac7a200b3de6797d08a0b882e4dc98ec7bf015cca438843e5431 +size 573673 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..dddfebaf29b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e552db9033498a81f9f7bc00b5dee1209de2b46165588a374075fc40ec2d72db +size 448797 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp deleted file mode 100644 index 293a6a8583a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5ad1c495b8d2ff469aeb010493aed1ed58c36c6f079cf6d5a1e11b99d08f1010 -size 573677 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp deleted file mode 100644 index 58e26744fdb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b2fe4a2a71d8a5c16a00570042ebd893736d62d7869a4fd619593f6827783d8a -size 449591 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..53099e53ebd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e2833d705020345d1aaaed4c61a3cacc32892d44e135dbd19bd221b551ae8ac +size 598541 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c5a3719ed58 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7c3a20e301f873aa5dcd4e7621ee494448e23c7afe56077bbcce24b3e4a98cf +size 466709 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp deleted file mode 100644 index 680b5450300..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c309b5a740e055cbdfebda21e45f7f0c827134869dc05e58fefdcee92cd401b9 -size 599333 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp deleted file mode 100644 index 74d1c2e0202..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7a8cd039cef6a0399ee96b8731ace84ff90a4aa744edd5ed756e29fdc45a8894 -size 466713 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ca88234df65 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:589d4b08fc5ffde9980ab12c9b4838c17529ca6b967e8f50a02c747d5dd81b94 +size 434891 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..45a650b1f68 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f003a3830bc67d2475a4fae14d0dc94d8c83d9cbe161549decfcb291eda8079 +size 423297 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e02f3e2464d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e17dcff605060494d253eb8fa3cb19aa89d870e7f8e0a559f59d37c8549d25fc +size 428079 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..89f065b5fe3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:862f7a25a606ebafa35ea4418fe57cdde5bc1fd4045c2b58fce56e339eff4e6b +size 339567 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..73809961446 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e883dd813d9a10e86bd3b8de433419d5b45905282637b8fe6bbe17994967e93d +size 330093 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ef93baccd37 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ace73e8c2db0fdaab97c724c14b8c3d396daa0d127e0f5234902e69f8b32693 +size 334087 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0a5d12f42c9..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3f05fb93984a1f7f64a83c9dc8e5ef31bf644f795f5fd7de4d4a4723ce2841c2 -size 434895 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 58a79480e36..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d608a60d345a96a0bd7d52601333a6889e45ae02e18858ec8fce60b3878b6af5 -size 424089 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 636728965b7..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0d97a2df942fad9a66e3fc432281f707da7bb47b76a4224e63d214b9de08dcde -size 340361 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index d85e760be15..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a92e8ba843df8aea2d7cfbaae4158d39293d5f53ba354d2d4275738c61678f59 -size 330097 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c4ab3973d41 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:201b37fbfd00c47253d1b6616a4d876d7a230ffb82e68af47422a9e09ea389f0 +size 449499 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..34b37a407a0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0e7c255f731e17e7b3a6393985ac1168485b30f1e3af0759a3ff61c3ca9c162 +size 437953 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..37b3ac75684 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5861de5b765ca5d8cbd7b5d6c3e585d48210ad326065b011b940f19bee549f2c +size 442687 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ad3653198b9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ede340f9e2c4665bfb8d3f4d72823ad70771e3e312a803746571cb0330a21fd +size 354125 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..23310581da6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c1c4e3bac00aa2105cefad42edd3177649bba6b561609faf03c81352017ae47 +size 343911 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..4e98dc40e71 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d206ea353fae1f517be789d3f7d5d934e9fd8060b8ae3fac7b4900f4997033f8 +size 347905 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 36b091edce2..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cd2bec22e11a92d1791b7bcd88f534c8d2bca4eda72d2fb75980a18f28d14344 -size 449503 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 26a384b4d37..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b7896a31a37e08ec4dd4b8a67d5bc2bb56f29903036f314ff47611e09ad3bc22 -size 438745 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 353b68182ba..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1f0bc82b7a822c0ea98fd3b35cbfc121f54d5ca2421cd7012e472265b4c8760a -size 354919 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 632e3968917..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:312ee5a9aae02acce880e84d643b9e714cbd2f87ab2d053b7a4533480a43d4d7 -size 343915 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..74a9677fadd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9056068cae55a4f1d3147e2456f2d49c3bf557ea4cfdfe18baf0b6842ee74169 +size 662530 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..24814ef57ef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e47c8f4929691a45897eb5de12fed04165bd871c430e009cbc7853a4391b39c2 +size 593411 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5ed3ce83e4f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d32d437e1fcb12c58a4398d7ca1f29fc9ca52f9e500af22e143b906adc9839c0 +size 649898 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4d971d71c60..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ca89604d7b5f69f188e946f64d025e72cda1e0422ef5dcb82f36c84a0032554a -size 663422 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index 59a62109b7b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3915d45f62f5b5957390605d46638655394b0e2a19c05e35de165c878a91b09e -size 592923 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..65eba942e01 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ed653805053f7adc7a0317beb952cf00f9279b2968d2de4ee4d390d38581002 +size 673980 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..27698b4c540 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed4f7851454966a7475123436c8816dd498f8e9c418a0feed536ee12cc6a044a +size 604911 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ac627f21037 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a778f5ce5b1d6fa7643bc5f4f75c9f3b8084f38f010a470def105c8ac653b74c +size 662186 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index ed3d837118b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:26851e11c1edb2e66c7a356c8aedd02260cc8d23e29fab29e56ab0ea3f389565 -size 674774 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index f1ef4b8937a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a4cdc63e768e4673c005c50ce7465606c1d2efc73629b6c3ba424b5b6824e30c -size 605753 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d037da3d01c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5459bf2c334c1dbd5491cb2cd69aee20681b699d32cd03400c9e52757472b590 +size 601809 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0fecaf6c9a0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97227f75807cbbb77686dedd29551fba21e3a275334ff26f1e8f136109acbb16 +size 549217 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..75d926bc055 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1db4ec3abaa48f56e7c43e68e080a97ae066c7267a607ff184d201b3a0bc296 +size 587399 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index ec57f6f2268..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c8f7e401af3e71250a3d464242d9d3312d1b4234de9e43388402e94bef45dba6 -size 602601 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index b0e59e6e33a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:44a69b5df55ec1e6f5a236d45d4780ed5f0b13108b2b00287ce4642e7187fc00 -size 550553 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..75bf39c23d2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d63eaf2b8d12ed6fe88acbdf2c8cf3d668a0c9b765bd105609eebc571638b859 +size 615033 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1d00b8a8a1c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:633bd5933f8144fc40a90fd509898f60ebf3f21dcd86a61bdbab7149c5367c34 +size 562689 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..92066de4595 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28aaec101d4aa36d29c590c4d26563252b4fa66a63185b0b3c3375720eb6ed07 +size 600081 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index 0b369d58e03..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_eW8_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3518c7bf7d7e6bd49b4f7d5035cc78ba2cb41a6dca5369a0387e650da521d20d -size 614101 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index f1909848f52..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_eW8_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:12b8570595eb8a4601ff078ec8fb2ceed7994e1244df7c791cade81f9ec97726 -size 562693 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ece763a4b3e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a259af922414f72601a4eeda15f418e540ccaa219ffe5be118011cc7be8bc58e +size 624930 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..07ea0cc883d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2d697073f43afff1aa826d838c6257056618ab1fbcc07eb7864ec420f603127 +size 495615 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp deleted file mode 100644 index a0767eaafc7..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:00ffd2cc40037c680ac86b686ace333e5e12043e83b1cfa23c55bf3191f79782 -size 625724 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp deleted file mode 100644 index f5a4f0e964a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:23b6a033425d34958c14fbcf54eb3bbc340229e1209952dd7aad9ee9d361b83e -size 495619 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0c4c83cdef0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba5a65592ca2ba8b38b4e45a40f5bdefe10f32a75f126a2453f73ff0b478290d +size 648614 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..88b408893ff --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a233d7eb32bb79957aa3db6074c67b516a2079081a1b5c930d94ba0c54bd477 +size 512885 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp deleted file mode 100644 index 9f9ccfb91c8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bde71729c556dacc9ea90b7f8a1130cf61fd4c5e4b8c3a732104f91feed3c7d1 -size 649408 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4360ae460ff..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:758aacbd1344e8c6fc7c250017b9dcb5e5c9825f1f87cf356b830f9791160780 -size 512889 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..baed2113994 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd2b50acb4508871c43ca5697bc95632c895bf0dcc106bf3f08d524c7f0f78db +size 454379 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..603c8528d81 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ff053332c6d0e4e7e8cc1ca8a3954239a7f6b75f483363bf16e1841fd8e8215 +size 437653 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..21958ed3264 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:719b4eeb3b32e6461ceee5eb6d9095cbc6d0b6dbbcfe2d9ebee93375d1c56a58 +size 447171 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1428f481631 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f52cbc59b0b7113ec8b9878b7021e606ecb5f28360f2e9fe03123692ff18ba6 +size 358363 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c754679dbc8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2def776b52c4842e996ffd6fd5410e7d53b006142ed5f4093b53fcf0b8728ce +size 343759 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..96abed2d4ee --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf6bf85226fafc37a3a60667662011583ac8449c559ce699b6ff40b15d9de634 +size 353327 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 33c6070b271..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:13f3fe2eb02f905f8343b704065b2c05a15c7b7dd9c65e25b99d6ffccd2780b1 -size 455171 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7ccbe3f47a3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:65dde9c5c1361eb2db5183172f2f5718f41df46d03d7b3ac8af3697e4c5327fb -size 438445 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7795d202edd..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5791bfd62d436633304cbca36be501f14a7d9262e0a7f0264ffa538167ce1102 -size 359157 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3aa2c0e67da..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:331403f3d4a1530ebe2a6fb1f3fe67f49a032db5d2b4f71191e0364a3303cd9f -size 344553 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e1b00b09f26 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4946addd472be2a75394db5df635af77341a08332274e65bb2ec2c9a5b7fdeab +size 468245 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..507d158683d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d87f6aa6ea63ea2f2a747ddda6d67579bf2b64d3c2fa56d26dd44b92d657f3a +size 452259 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1ab5aadb88f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac73afe3be3f78f535b41d45828f53ac410931cd861bb7a0f0954d1f6cf38fda +size 461827 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b35f0f6ebb2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41445277c2bf6957f607e4018b0d08bcb2c81e7b6ff17cc1bad79f92adfbdc41 +size 373759 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f2661ba6d54 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a4b96a4324c33cd646cb4b81f7b15867cd6feb14688faaa13cf1454b7883488 +size 358365 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..bffe3383570 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc5c3f185a7d630077246772b4b8c72e1dfc9f71d5d7e189c34bfbd071547bcd +size 367885 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 78c4cf16e1f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8a925cd05c89a320013fd079a7b42b33365ab8f8cf49416a6203ef181c4ad554 -size 469039 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3f8f4968241..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:06ee7c2be6d573d3768a0656f21ed450022cc100fbf9a8046ea45538286fc7fc -size 453053 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index b0bfb0d3b6e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a66cb8117d8c6f9443a99f8199fcca39c50fa97a4c3f1ca41c68de83af28b5e7 -size 373763 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index d3952a2b008..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b494edb9003f0ccdf0b504799cad4d9e97def2645a71308e0c30b857e5b8a5f4 -size 359159 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0a4cdbf2322 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94124d32592243dc83eee7ec5e966f1a3636fc319fea9da9cb9ed5c2a32b7425 +size 689952 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7320b6358ba --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fe3f995b418418bd8748ecb81353c40e73f36eac1c0545beec531cd7c2c1ed9 +size 573069 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index 199094805d1..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9a8fa50a8c6c96df653ea7c4e379c411cffe4cea2cf39c12b21a76a75b4c3235 -size 689956 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index fb81af5e24d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0a553dfb9045aa23ccca989de33328708e67cd0369d18b6968e8350e13be30c3 -size 573073 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..4cbd6f11093 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da5d111e82ccb3183d1ac73f4f08e1c96f29378c6edef290496b2f056a29b5b2 +size 716546 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7eb1b332c25 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81e8f13d8dac7b0a584b8c6282a6faa8292b14e3bb83703fc8ef4835fb35c51a +size 590191 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index 33f7e64c682..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:324af3ddbb3046e9493bb2d3e95355b2b8e6e3c1f1db077be8f047a1af5ea65c -size 716550 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index e6b5b45d559..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:16672d76149e35c37a21ef4ceca9fa0289ec4f33f4fc0eda45ae0f8ed927683e -size 590195 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..bbc49337ff1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91e67ad9dbeebdd76cbdd541153a27888c9947246d218261a533efc944887bfb +size 517337 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..cd8538ede62 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:025cbc34cbd3cf150972834e332802cc91da0a14325c74e6a4d4770eb5a8f93f +size 483147 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0d59ae80249 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f65948ac4284af1d7a21fade652ac3fbe5c5a567c2cb7506e9e3bdfd525ec90 +size 510329 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index fcd52c8431b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dedcb246712517575e1a67fa470bf5ae89850c0beba3c06e1bfac6271bff9405 -size 517145 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index c917860bbe6..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:62dcdda9d091faf83f10fccf7aefe2cad9cdae58b76d068b6eb68790a38be9b7 -size 489565 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..26b478f60e5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8056dd7bf522a48f2377c0f99633468e8699a32e223643fa217a2bdc6da6987 +size 532981 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..abd9c0eb902 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:410ca3723462a7b710c8d18521e45f87b6d9946fa4f088719026fcf91337513c +size 497655 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..4535076c91c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aaa376a4e5212397386b628ca208a67a67fd1b30a89287012fafece292ea37b8 +size 524885 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index 9a69c16b338..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:126d6426f703cb20e56607f701dd1697a8d64ef040d2874829c18b5251e02768 -size 533773 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp deleted file mode 100644 index a75f808fc91..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_lbW8_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88eac519ac2c242ae239ebba2fa6d8c741f5d6c4516b10816cc5210f2225931f -size 498499 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..19b106812b3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c432113c52951349c5c2465342899d03c2bdfe6136852d57df8474e415bbfb17 +size 415995 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 1a9cba44d7c..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8ade7457167307acd98dba6abb06784cf89bf6937c02de2dbb32ebe37b115139 -size 415999 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0bc0bc8ff74 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bac8b90c45b6140bae0a246ea8a0d0afa882e86ee6476192c51545a6d85c2175 +size 550035 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..96a29aa8f5f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfcb15c2f8e38fd64c10170ace7c9c6562f33c4ff9d606eedacf8d734c550399 +size 428565 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp deleted file mode 100644 index 294bd5221ad..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4bb4d4c57b1569929a3784c891165134768919755214c8580f73e952e6432e97 -size 550829 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp deleted file mode 100644 index 09c4b1c4395..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fe6491ab7d5a4575291dda2f641ee243d81f7819f260e1d2820ce53d89a7b8ff -size 429359 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..c362afb8d9b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1ba03b73822507d34448444d00e93d198ef9798ae20a087a967aa499ba8a117 +size 433215 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 4d78f8d8bbc..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bce6634b17055fc43628210b96092dce7d7a4701c777294027de70a4c04e8289 -size 434009 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ccb1dc62be6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56ff79d049d4b7e02afac97bac74c5a6bbbd92743c132e670b2c718e675c004b +size 574065 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7b465ec2698 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2d94c807e348e7270884a1951c72afb0a330e4e07604089ea8b331c429f75c3 +size 446575 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp deleted file mode 100644 index 84b5d0193d0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schPd4x2x2x3_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:581a57bf7e3c36b2ab952d86f0630e561526de5d4ab13752feb13de9c1d82b3f -size 574859 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp deleted file mode 100644 index 30c943deea0..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaSf_rgTma_clmp_lbW2_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:00aa83431d7bf145b9fdf4c14a6b9519edde561dbdb8dc14883189780af9e943 -size 446579 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..31d6abb2abc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da8739c98dc3042e663dee416feda613bd0911b92527052bf917b8a21686a213 +size 425759 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..da5b89784b2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e433d084eed732d9f9c3ccace3b2d75b84f4b0e6d335602c999c3c42db3328e +size 417815 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..c9b43d7af79 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c4bdfc9b152c36adcb0a72045e7f2853c9c7d73e0c6383860d1b81d5dd3c3b8 +size 420329 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..44534baa9a6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e629f56e2fb5f35f6f0780a22d73d2d284a2aec4283be6b4970682cc2cfcf00d +size 329745 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e73134fa3df --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:643a84538ace3ba24151b6c23367db623a2862a18418580f9b745b3627730f50 +size 322983 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..69f0e27df06 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6a38254634481fb65d09004698074b944a383c7cc5dfd3e4d8bfab34676f0ed +size 324561 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 061cd4c2f89..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:79f7468d46701f5ccb9388499ffdb701e334784c1b601c9adada021e9bfe1978 -size 425763 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 972fb4b34ad..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:64b3a8fb41338d911173070443ea22408127ab7250bee9a0bc9cae6f72b66a72 -size 418607 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index cec6615968e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ce59f29647046597d0fe0c7a16f5a0b91bcfb27342d996640c50d6aa0101a2da -size 330537 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7b7ddf9dcb9..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6b159d8e5a58f315e3ebbd03fada70a0705eff784ab4586625cfd9bd40c9511e -size 322987 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b204a8e9076 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7c68450ca2893c342b3f67929d6504384ccec8e179832efd5f31007f110a78b +size 440267 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2dbacbaae48 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c227ca255d091b0a7bbfe9c6a5b49b5fb383042466b6c0025ead0d1bbbebaf67 +size 433161 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b72e58445a8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab4b953ab070b03e36ecf0a7a8b7c962b5b9fe6e6da9145b3100642aa9a3003c +size 434837 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..6fb4cc6d1cb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49423b86b73b08a2b5f4ee6f6910c7d0fcf61ebd0d0b49c5f7933bdcd59dac2f +size 344301 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9bc7291b183 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff2fc3211b777b810badccee30eb6d884ed9259361f2472bbe1939c1cabc468d +size 336801 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..abce39afbff --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_silu_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29330f7b7d24f2795521ea967d11b1ee26dfa9a65179e25a213cc4720dd2baf8 +size 338377 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index e668575f218..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:27cee5a328d19df6a3c6293c5fb112a560a79f053fae15c64efbc324a1394fca -size 440271 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index 20fbe2abb17..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schPd2x1x2x3_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b0f88b6f08f66c1cd2e1c7a5849043093d68ffee028331a2080ae50a5da15f93 -size 433165 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index a073f3cf51e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:55dcfeccb79b49d11d51e52b37863e8ed079eb887fff093efc9313e0f2c29195 -size 345095 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp deleted file mode 100644 index e4493836b48..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaSf_rgTma_clmp_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7b565e1f7b60c24c2ebf395de40290a23e298da305c3cfea69c6ca1868525ae2 -size 336805 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..e3833f96e00 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4feb7e0caa4866f815c9ce047946faecc5093505ba05bc2b823ed9fecaccbad5 +size 320165 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..8b0a0a0c747 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:012e57c2c21bff47dd7db3548a1de5189be60b83ff0d24d39a517704b97569fd +size 324321 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 88c2615d0f1..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2a855aedf5af80fc2783fbfb1cd11f4cca11a4a0f83d6caba1b36e2690b27ad0 -size 320959 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 4eda40202ff..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:16daee862e65da33eeacbbf286307fba3cf8346d61ffb0e365f088191c33d254 -size 325115 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..eb0a8ed8bcd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9b4172a3874ef7f0a7d22d82fe59db26a5f05e58c2b8960e27dad1edb92f884 +size 342763 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..f8db579a011 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09b3a6782abe42635191c0990f0d0edea2b60a84b9bd65d66f93b539ea37263f +size 346969 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index b109a9dee4f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f08b2ca8961f089842159d12e51d3bcdee753dda8bd664fd8589440a6a7afb3f -size 343557 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 8bee01540af..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a2e92502ba7c4bb835f0e1ee6697ec87efa3aaef4d11c5c633d49ebde811ec28 -size 346973 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e6a5f124157 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49995c11496cbe78ccd562bbc73eff372206e322a115e3a6348055564ce0e247 +size 532431 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..d967e3a87a6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f5959aea6b63d8960772d6add103fc2949456e6fb90228359bdd253475a780e +size 419791 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8f0d1d7e031..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:54268fec41c88f4d872c0b00226b17b0c7ac082900b21c58aca94abe6c3f9669 -size 533223 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 58cd78c49e7..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bb83d45c84ba77baa3e572b1161f7de2e188086f6b5cf0cf41e936ec8ee41479 -size 420583 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0eb7978b6e9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:550cd962cbf31aaa36d21e9bab824a77a18cf0ceac339ba0b573a556a75ac39d +size 553007 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b106a72a99e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87b8398a35ae64ee9522bfe4c76ab7687d4b45d9335a24e0e580dd806334a506 +size 442043 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index ef049ebfa7f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:298302772567ef50f1cf969336def0529dd62c47ec6608ed0a3cea43f0df7e0e -size 553011 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 1fb1bc262fb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:33fbd9ab1822ab97f466e2898129534a7961cf1f59228bbfd32eb815ac25c75d -size 442837 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..03a6ffbc013 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f7dbf5756b28e59b342057f2a6c22187a0c3c8835b952366ef74a661f36dc30 +size 551917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..4f3bc4ddb7e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31901daa015d307c08c5988f119bff2366a05341278fdbb034bb4c8d360dcf47 +size 440955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 905cf636dd3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:85abf7067a876e6cb331b3805aee0f97777d5c7dccc93098ba1b823a4e4b8844 -size 552711 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4b5d970c8cb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5d896cdccef267fd549c7401f6e4c9a10097bda7e99a3008ef4d454572e98f1c -size 441747 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..745d30c9589 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d52610f772442634c0ea7b7f0049cea91bd51efec9ce0209e1aaf779d1182b3 +size 575107 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..01b622aa6eb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4213f96439a240f4006e58d227407dbfb6eb7550690727f96afb5dc404d97418 +size 463207 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index a6e1d9cff37..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c26dbda7a6f04051fdabcf95f23147de7a002dd0a9b6656a962db980258a3463 -size 575111 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3919ad08e61..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:92d2b083e69d35bd4ac204d33215858128e5fe3c73c6f2273feb039de5332b6d -size 464001 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ce07e49597c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56255020c5249428838b5393c04e1b03df65400d9a67c2b011960202a46d1557 +size 610771 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..df1d7b4f097 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17cef2912e3d845f8ad1740421bf2b4d3a2b9a58af55e2e4970f8c34d2092da7 +size 478299 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8d546414a46..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:51245b6e72a51da03d5202fe84577e982c32483c505550daac1919d3ebed7e3b -size 611565 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7471e145b6d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d6c5f1489eae7675797329c63b034a0baa5be72473dcdd2f0b0454df63dd780b -size 479093 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1239184b9ff --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:516acbe2bbc741569d9e98958ab8b4cea8cf109031d345a63fe83425a264adf4 +size 638650 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1f3a401c451 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1a339ab0be57dc7d01e888d137b9a7b153ca7e9c52df79a68a10660c599fc5c +size 500553 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 471cdd7437f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf9bc3ed0fd96eec359855805a98d09c414cc0282cae03b8655a525c1aa97102 -size 638654 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7fea5e25936..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a396d01af3a15af103f57ecbe79403c507fccc5b56262167f3177ec247d82934 -size 501347 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..308091699e1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f79f7cacd6c9b3cca72644ad32171b4d51c431615045173d8529ed0e289c649 +size 520781 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f9e021c0111 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96c941cbbdaf49fe22cf1c5cdd84749334b3bf02000ee896ff5b0504d15c3870 +size 408981 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 9bd7f23e38b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:214c6dd737a8de9aeda043195d06ab0faf2c69d84abba0879e922658675c2690 -size 521575 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3b5c8b6a247..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:59ce0423f423e3172040874238859c6f83b4fe9faf6688c742756368a64ce976 -size 409773 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..6e6c6524417 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f31543b025a2cecff4c4789e4cb59ba03730b79f8b683290145b8d6abf62648 +size 541703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f394125ddd7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b9ef5abdba97bfe51b0690b84799b34237306c16a7af873152bfd410ed39be8 +size 430445 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 1c689f4b891..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a083d3f8b34b6722aac0bd76b990e9f4c11862f0db26734f2c4895894d0cf540 -size 541707 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 83fac9d2e78..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dc4c66ec1f9c3a1e00b3bb80388183c3707ace82c4e5873a8b1e06587a6f168b -size 430449 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..6166ef32d92 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9aea8139d53fa7d967b93936d98985bb30c2f149028350b8d17e9a5915c702d1 +size 523545 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1ed176fbc53 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1ff3bb0e270f5ae139d75851792926fd28b560ba675bdb6912296201aaf2e51 +size 412285 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5bf17974943..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3eb4c544753b1e3540f6ff940017766e427f81bbcc9202615e712098134b9aa7 -size 524337 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index ec45698f91e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0c19fded839bd2db8eaabfa526ba92747f515d87259ed1c92020bddcdeb928c8 -size 413079 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..187a4670bef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ad0edc2026b33862d4aafc31d1a46f26fc2549c59479a0e70f48caaa716ceb6 +size 548561 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e58f9691547 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa447d978144c27b193d9d2043c49401a0676cbce1b4c41af9735ff94a914880 +size 440903 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 1311d3b6bcc..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cbc822d64c072df485596d7dd4b328289e5b17e12c4c0afaebf16303a0fefb1a -size 548565 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3cf07920e8a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1200dbf28c9aff0e13ef361e7dbff59712eac9927b03325e0765c7579142287c -size 440907 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..e4c8bba073e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc3a892bb7e48d0fc71e4883a16b83e922f1ec8e0aa33e593df0be9ff6ac330b +size 416791 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index f75cf5ae8b8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:29617a1392e2d9cabec20dfce3dbc5ee8a5e64f8d749c3b4aab98a1295b2af34 -size 417585 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..51071ddbe9e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01909be85fe25b0cdcadd3c5e9a9d1b33dbfef52fff65ade954a56b1572c38bf +size 440723 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 75b533f4336..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a3aef0892235ecdb70d1f3ed387c37969f7119bde8fafc078e042ea11d56f22e -size 441515 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..8ced2e7afe1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acf98ca04862901dd33af74179c414822afddb479d9dc01c50ac79a559c2db08 +size 400257 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 66d6cf21286..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:649f0385e6283ea9ed3d38dcd01a38309ff7d96e0b682569157cde82ea831785 -size 401051 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..5ccf97d204d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_c1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afbf35155b1d3dee42c6b2a2be89e7eedfa51f2d34c3ddbd43b50e30881fd7a5 +size 417479 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 6e6b38216fb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:10d59be965510258d34601bb89d2b841e1fb87e583681dd105e89c7fd2e70461 -size 418271 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..12cb51e2aa3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea7556900e38ab67b1c468a9a8a77a477c10682df4d5ff05c65f516af089b51f +size 320955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..df63a6949cd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf20f00ced999b623d173c8604cdab128a3a0a2144bf03d30d70221f44b9b95a +size 325111 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 31485e566d3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c5e8cfc58a7325a33a82c6eeee5a091503ec112bfd0d3707a1d4393259bb21ef -size 321747 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 0bbaf505ff1..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:37cbe4331afc350757181df86269190a1bd6630506a055bcf4c0a64a02c93feb -size 325903 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..02b8746910c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9d85c137dd80f2c42eea8a0970c23bf94abb57f19ed1eebd28cc5f71b3c4f83 +size 342763 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp new file mode 100644 index 00000000000..f9efcb42055 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33f957aa8376615ae9726e303a45831940b96ee5bffa23932476b9ba9d3c451e +size 347759 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index e2dba12a2f7..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8ececb610b5c12e738d19afd52058a19dc98eafb20c0bc9f32d600b8ec7ffe80 -size 343557 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp deleted file mode 100644 index 9c41e26e4ee..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_rgTma_clmp_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:369c9ed30970016d80c20ad831e768a0761456af2e40680c164cad5a63c8803c -size 347763 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..991e2196d74 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9e567ccafb2a5861f04fb67382b06b9459664dc62c7433a505c61593744e2da +size 831558 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index 17cabecc582..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2e291cd6a7241705592a3d7385b646818d26edfd8831edd16ba3902edef8e803 -size 830722 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..6c7139cdc66 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6871ea837aa9789bfd04caa07f3ac537454398963e3eaab43851228ace362c71 +size 854254 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index 28232b254ea..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:211a605616adb431a3ecadf4a060f1eda33e9f0c423bd5a0d9c0529a2d65a242 -size 850262 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..a6ec9d28fcb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12d2d112d7309242d76e5da49ca2061b94f4e56143a7bb92566c8de0b5eb931d +size 834222 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index a35a93f4c19..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4904a2711ab04dd79cc73de2e8d1de55445ae0bd8987c43362f44974473fcb79 -size 834176 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..a492aef7ebc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0ca4bcc23e7eee7a88d43a2259495542bd4b6c7cbe42962103f598c6c10e0c8 +size 858694 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index 6b2e8f5d863..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1be6eba16b7fc04c17fcfc9c800f7b357f35d058afcae78e33ef952a1c1c53af -size 858846 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..2fde935e119 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bddf944a455f3a5c7188a517c49f501d77ef702730d30568ef914a734be39105 +size 617125 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..8dba097c0da --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:860d22d193b7ee80cf280cb409764405dda082d4e6f082a99c3bc573465f93ec +size 488107 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 7a8839c2d11..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6ecd4c0f3aefe92debbb346298c9d5b0f31dfc8a7a4328f8ff0420cca7f49c8c -size 617919 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index b1e00a41866..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6089702b5307d5a8f80503dea35a5cf77c386c7abbc137cd3f2dc0ccada52f98 -size 488901 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..8c8e0459c98 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:763cca7a183e637a4acc5ecc4de86ed556e1d97bff5679d864e43c4af9a3156d +size 661428 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..ac67aec1aff --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0aab18def713270dfd87e1fc16155d5848c71152319e9c463f1a8f0c393a1c3a +size 543509 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index f8778483b05..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c8a4eef21d1f5f505a9c7b90231e6c0b6aac13fa36679ba2000636e64bd1f086 -size 659114 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 1646455a379..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:056d742caece8597db1ed575686c42eb69b01d20de7aa3b352332fa11ecce150 -size 541539 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..7b164c6d62c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:872a444e6de61e1edf9fe70a385bf44c1f99445a4fd7f71a624f9c759fa57814 +size 640614 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..f971bf6e079 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca7b11ea1d9892db7a5941f9801e4a3cd2c4de469fc13bfa086449ddcaaba621 +size 511989 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 06bc162f06b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b91bfb5eb509f688160a8a31a021e95402674593983dd7cce9075cba60365868 -size 641406 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 2c98a225efd..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d9f241eb393193779d4d29645d0d76fc7c3f9980b4dfd2a5f440e4f88d097ff4 -size 512781 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..8529643ab3d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c0b44ac1d879db002cf0235d71d1bd64931e2cc8fd96d4c37120cbf96cb74fc +size 682942 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..91b93e4fafa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70c84a354ed22d0cf11f323c3c1d396ecef0d72e2d3296da71666995da625194 +size 567439 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 4870829d9ac..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8a76156cbf09758c97d47001e050935639763c4114f50471be303dabf2d3d017 -size 681318 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index ad8f18dbe0b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ac03cf5559dfd1db99badec8ee50e3aa86d9fc2fdf6642eb3d0171afc691ef82 -size 565469 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..bc062002c4c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56783775148cf30b8a048014d95e91953126f804a1ca83e0a3d068c81de27ef5 +size 1109756 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index 677d17f172a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:89598f846c12882410f0453f57f7a0f2a14213b297046674364d6db252d533c8 -size 1109514 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..4ba5a9ede0f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_c2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0db342b69eaecd8d7ea7175105c3d10d458c884d8db044a524fa0e03ee26263 +size 1135754 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp deleted file mode 100644 index e9314aa1e37..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schPdx3_biasM_fCp_tmOv_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW8_lsfbW4_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a731ec3c224108e3e137b7b42624c5e0214a1faca314b36a8adc4ee14fe463aa -size 1139162 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..92970110c27 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6638acb7b8d68dd72aa434152a316ec7364e48d0fa22372906b602d91cc005df +size 655854 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..48c62236297 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a422c18bb2432853389426d9ccd41e8187580ae49c22ec393717ebebb251d343 +size 526685 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 2d77561debd..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8294f0a3f32657c4b6ca95a41fdd3a3492c7eb52e37868fe9ca01c2ef85b0602 -size 656646 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index fdba2629d84..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b5e28f946891a59746511c7621935b1dfa43e1adb1c59e84df601560a353fac7 -size 527479 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b2bd90bb3f4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b47bfb33604cb45ce46e1e2ab429d54630a495f9227497e26241e52983a130fa +size 692754 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..b0f32f5a4d0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97f516bb3b95350448023ff936ebce3254f12de61a9039e00a2bcee56564d60d +size 582285 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8dfbef9eba2..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5b6b1b5d6bdc072175f06254185ba1cb0eab03f56c9b892348607c9be996b5bf -size 690834 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index b3a4e74b65b..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:809f0571c44a92819b81431764a13177393d5461ff6708cf6d450c17a25d754f -size 581055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..504e51281ed --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:783fb436aa76871baa0a277f1b035caf3290cb34012962b643e072973c39053c +size 679242 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0474121e42f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b45b3b5b53c8c5e84846bc3ce88a2c3e2af0e6e7be2e906b25d2cfc4097e0f7 +size 551357 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3c0c9a8b08f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:06f9f6a6d09217f1b1fd6268d13c718bfa14239067b78239ec5aef735e5819f3 -size 680034 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index b2f339dc64f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5837bfa375ab5e529cd1630769792f6d335b8fd7ad690de830aff6149b61e897 -size 552149 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..cbe9a516f67 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ad3e72d06e7554cdb57a121d5d922082702de3743382b7acc1c6d05335445d5 +size 715994 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..74d355f006f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_c2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4213c2de32468dc9bc2f17a6cf64e2db0b35ecb146a9aa98ddac1a39d5cb0d74 +size 606165 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5d22b4647a8..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8bca5f70c58ab1adfc60c97a6d337362ffc31a9e0120da042b91c30a7a7091ba -size 716690 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 56313d9d2b2..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:43bf6a8a96b2640d2cefcda6d8ee573129e5a72e6c4b7180b953174c76c7af45 -size 604937 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..747f0363192 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ec4d620edd090e5c07c104f286aaf49d1e0a4d493dbbbe819f3df4466aca4e3 +size 750696 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp deleted file mode 100644 index 935efa96631..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9bd4e30634eedce2886237ca75a6d89cf4377390910293808d37bf87f64858da -size 753710 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..e23f563ae56 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c21209ee766957676d0dc53b13d539eeba7a1e737b0505f867527a0665ddb17 +size 775120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp deleted file mode 100644 index ab070b7c301..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1b644888468d61bbb6666e036c5a3f24a7d536cf82f3d93d133202992571f9db -size 772806 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..cb123cd8788 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6700da878680109e7f12ffd877d49e37b08ec98499059e8e6a6515e0a9ec97dd +size 792530 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp deleted file mode 100644 index 00f56f5896f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:aa7b778235a2cf818fdbc38be6f76be437dae3643bdd44ce32fa53e2134124bb -size 793028 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..1a9cb217e0a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_c2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca23a4432d274c9713e1178bb9bed8d1f13456a724bf382fb08247a834afe486 +size 815474 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp deleted file mode 100644 index 1fb84fd3a59..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_lbW1_lsfbW1_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:291a9135a62ae4607f42b402021e27cad2fd600ba6de7f0f11c3ae8ce2b1ac17 -size 814542 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9824b0b3f28 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a675ad725e77c30c4e7a323a636e941dca52c2db8559527096942e1e6df8f48 +size 594327 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..fca631a89f9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5f11eb6c0b65044c1f81f4096b8c7dd8631520cc1ffae133db9ab9921894b89 +size 481293 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..61123e208ac --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7465cd5b944c1a554c6e8634c27e67c66d2d5d98f6171d7487f2257f24c600f7 +size 693308 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..5327401625c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15fac533d241fd8d173cad175a8d4d675a36eecec33a89798d8cdff1b907acb9 +size 539819 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 016d779d4de..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:66f927d410f569a25d513eaacd6f70b0a5bee3a53d5e2cbacabf167aac7b1ccc -size 594331 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 6b499e10c64..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dd591d1e19e7624c14ade126069eba3e44cab22a6cae492fe21595c49ef9ccb2 -size 482087 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 3dbe7e3c5e6..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d5b95fae5000e87d8bf7e91e17756b3b713b47ec7d69ac8a2ca8ed82861d6af7 -size 694100 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 01f450841cf..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d5e52575b8b1127488fedfcc6647e8ea916ed5146ea83d13709576b98913296c -size 539823 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..27eb2803b62 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96a852256e8857d8a9e2d644269d97d6e0b98d7695c024dd838cab1052053a97 +size 620676 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..45c42874540 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:442456ee1da37a612319bb47f6c57769794a8472df6873a8ea61d9c8ed941be5 +size 505767 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0a85950c002 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96278d994f3784eb8d7a1206e4a04aeca104fdd00a4d7f27c0c63ef482967e9b +size 716400 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..3603b6ab4c0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19d373291d7f87412b2c539d3bc77d47da0323436ce5fcc18cbc29c2f1334ff5 +size 564587 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index d73e263f70e..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:22c25c6d9424cb4ec7375b7d8bfe8b02e9bc4264b884a3099f3e529047557687 -size 620680 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 79d23132077..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d8bf59455ec05ecb70753f6a7660f99a0128874173a51e9599f3cdaa858056e3 -size 506559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 02ef6c3769f..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8cd529dd26745718b4e52ae91c680c03cde6893160f6ff6e7fc29d2dd88d4ca9 -size 716404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 5acf00ad3eb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b2b496a1d669776890ddebd78cfb97eda189fd2db4879534d28c902476097ddb -size 564591 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..12c11d9902c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99fe2e8a8e8ccb75e75afb2ca51249c6150254dfa52395e4d0c8ec6dbfd04904 +size 593095 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..6793d69e5be --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9426de36443e751ed6d0f995c455b547dbdc9064df1b9efa3882e0ca6452ccd7 +size 475127 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..39f6d58519a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fb5879e4f17b68e6a0f5a87f95aa638de2e75de15dfc915c2f042eafbd9c7da +size 684328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..56a5ad062d8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f0551c4f64c40da735a443007bc087a62e389e3f3e1a253942625f7525f788d +size 534195 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 8b7b4094038..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f0f7ad67622d852200644b06c9c7831de89d12504ba1e0a7bcd710473296fde7 -size 593887 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 818c6704d6a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3a7a0518c231c51981c51f20aaf5e045f1669f4ddf7da50cfc857ceeb874c4a3 -size 475919 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index e83de87bfdb..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:59df568e46029979013b1fc17349135db449f46057c3eca4263dfde15ae599ba -size 685122 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index a6cb177e2f3..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f1463584c253e0f4791975bb0275c188e2fc6b0fb671ce5ac05b006550d7e6d9 -size 534987 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9b13ebf0f9d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1b8d17dbeefb8ef0fdcc6270e152b5ebb069afdd36c78a46e20af3a76ec1d95 +size 616483 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..0c70fd874bc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a6cc1cb1e00e86c52b5d5bc4790567ec847d794f7790e2a68cb997c0eef20bc +size 505125 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..9850946f672 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae652545e98b56be68e1268f80c4ec1a0f887a41b5e5bb400d87e2e8ea856256 +size 712304 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp new file mode 100644 index 00000000000..22fd4df6c7c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_c1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c9f73acdf5dc39206fa9ae1b92e4c4f4e45bf9b19c6b761edb860af9d248b9c +size 565131 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index ed538dd1f6d..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ca797a09f7cba755766c62dee60ee9d1b2849b8a08cbba1efd60b3331e790f31 -size 617275 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index ac603e42e9a..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:51189ce4fe72ab9a4a9bcb90a798e1e9e9c8cc7388b3e2593347bcf909db01a5 -size 505919 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index d73f19c11e5..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schPd2x1x2x3_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d03954e0e9d6d51e16cdeeea4632d20a57dfb6579614af0d71e8f22f40c87eaa -size 713098 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp deleted file mode 100644 index 1c1c6024647..00000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_ldgstsSf_rgTma_clmp_swiGlu_dynB_sm100f_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1be8d4dcae4b6cd618993a8261c9fc11456146b40187b0fd4db48f7412d310f9 -size 565923 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/CudaArchDecl.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/CudaArchDecl.h index dba18f1c759..c0070eead40 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/CudaArchDecl.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/CudaArchDecl.h @@ -16,6 +16,7 @@ */ #pragma once +#include #include #include diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/CudaKernelLauncher.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/CudaKernelLauncher.h index 26e9d2d5122..b74d13476d2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/CudaKernelLauncher.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/CudaKernelLauncher.h @@ -33,10 +33,70 @@ namespace gen //////////////////////////////////////////////////////////////////////////////////////////////////// #ifdef TLLM_ENABLE_CUDA +inline CUresult launchKernelFlexibleCgaSizes(void* kernelParams, void* cudaStream, int32_t smemSize, CUfunction kernel, + dim3 block3, dim3 grid3, dim3 cluster3, dim3 fallbackCluster3, bool enablesPdl) +{ + // Make sure we can launch with that much shared memory. + // Note: those function-level settings are actually ignored as we use per-launch attributes. + if (smemSize > 48 * 1024) + { + CUresult result; + result = cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smemSize); + if (result != CUDA_SUCCESS) + { + return result; + } + } + + auto clusterDim = cluster3.x * cluster3.y * cluster3.z; + + CUlaunchConfig launchConfig; + launchConfig.blockDimX = block3.x; + launchConfig.blockDimY = block3.y; + launchConfig.blockDimZ = block3.z; + launchConfig.gridDimX = grid3.x; + launchConfig.gridDimY = grid3.y; + launchConfig.gridDimZ = grid3.z; + launchConfig.hStream = reinterpret_cast(cudaStream); + launchConfig.sharedMemBytes = smemSize; + + CUlaunchAttribute launchAttrs[4]; + launchAttrs[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION; + launchAttrs[0].value.clusterDim.x = fallbackCluster3.x; + launchAttrs[0].value.clusterDim.y = fallbackCluster3.y; + launchAttrs[0].value.clusterDim.z = fallbackCluster3.z; + launchAttrs[1].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE; + launchAttrs[1].value.clusterSchedulingPolicyPreference + = (clusterDim > 1) ? CU_CLUSTER_SCHEDULING_POLICY_SPREAD : CU_CLUSTER_SCHEDULING_POLICY_DEFAULT; + launchAttrs[2].id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION; + launchAttrs[2].value.programmaticStreamSerializationAllowed = enablesPdl; + launchAttrs[3].id = CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION; + launchAttrs[3].value.preferredClusterDim.x = cluster3.x; + launchAttrs[3].value.preferredClusterDim.y = cluster3.y; + launchAttrs[3].value.preferredClusterDim.z = cluster3.z; + launchConfig.attrs = launchAttrs; + launchConfig.numAttrs = 4; + + // Add setting for non-portable cluster size. + { + CUresult result = cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, + 1 // Enable non-portable cluster sizes + ); + if (result != CUDA_SUCCESS) + { + return result; + } + } + + // Launch the kernel. + return cuLaunchKernelEx(&launchConfig, kernel, &kernelParams, nullptr); +} + inline CUresult launchKernel(void* kernelParams, void* cudaStream, int32_t smemSize, CUfunction kernel, dim3 block3, dim3 grid3, dim3 cluster3, bool enablesPdl) { // Make sure we can launch with that much shared memory. + // Note: those function-level settings are actually ignored as we use per-launch attributes. if (smemSize > 48 * 1024) { CUresult result; @@ -69,8 +129,8 @@ inline CUresult launchKernel(void* kernelParams, void* cudaStream, int32_t smemS = (clusterDim > 1) ? CU_CLUSTER_SCHEDULING_POLICY_SPREAD : CU_CLUSTER_SCHEDULING_POLICY_DEFAULT; launchAttrs[2].id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION; launchAttrs[2].value.programmaticStreamSerializationAllowed = enablesPdl; - launchConfig.attrs = launchAttrs; launchConfig.numAttrs = 3; + launchConfig.attrs = launchAttrs; // Add setting for non-portable cluster size. if (clusterDim > 8) diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/MmaDecl.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/MmaDecl.h index 7b136dad2e7..5677e1496ef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/MmaDecl.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/MmaDecl.h @@ -93,10 +93,11 @@ inline std::string mmaKindToString(MmaKind mmaKind) //////////////////////////////////////////////////////////////////////////////////////////////////// -// Get the TMEM column stride per group (i.e. kGroupSize * blockSize K elements) -inline int32_t getTmemColStridePerGroup(int32_t tileMn, int32_t mmaK, int32_t kGroupSize) +// Get the TMEM column stride per group. +// A group is one or more MMA instructions that share the same TMEM columns. +inline int32_t getTmemColStridePerGroup(int32_t mmaMn, int32_t mmaK, [[maybe_unused]] int32_t kGroupSize) { - int32_t colStride = 2 * ceilDiv(tileMn, 64); + int32_t colStride = 2 * ceilDiv(mmaMn, 64); if (mmaK == 96) { colStride = std::max(4, colStride); @@ -106,6 +107,8 @@ inline int32_t getTmemColStridePerGroup(int32_t tileMn, int32_t mmaK, int32_t kG //////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace gen } // namespace trtllm diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/SfLayoutDecl.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/SfLayoutDecl.h index 72d0e1a259a..98591b0b502 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/SfLayoutDecl.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/SfLayoutDecl.h @@ -70,6 +70,7 @@ enum class SfLayout // I.e., the SF buffer is a tensor [⌈m/128⌉, ⌈n/b/4⌉, 32, 4, 4] // The SF for the element (i, j) is stored at (i/128, j/b/4, i%32, (i%128)/32, (j/b)%4). R128c4, + }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -88,6 +89,13 @@ inline std::string sfLayoutToString(SfLayout layout) //////////////////////////////////////////////////////////////////////////////////////////////////// +inline bool sfLayoutCanUseUtccp(SfLayout layout) +{ + return (layout == SfLayout::R128c4); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace gen } // namespace trtllm diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.cu b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.cu index d750cd8f41e..fcc12ceab7e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.cu +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.cu @@ -240,11 +240,18 @@ tensorrt_llm::kernels::TrtllmGenBatchedGemmRunnerOptions getOptions( } else { + EltwiseActType eltwiseActType = EltwiseActType::None; + switch (actType) + { + default: + case ActType::Relu2: eltwiseActType = EltwiseActType::Relu2; break; + case ActType::Silu: eltwiseActType = EltwiseActType::Silu; break; + } options = { .dtypeA = dtypeWeights, .dtypeB = dtypeAct, .dtypeC = dtypeAct, - .eltwiseActType = EltwiseActType::Relu2, + .eltwiseActType = eltwiseActType, .deepSeekFp8 = useDeepSeekFp8, .fusedAct = false, .routeAct = true, diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy index 99d6ee68766..87aae7a5c64 100644 --- a/jenkins/L0_MergeRequest.groovy +++ b/jenkins/L0_MergeRequest.groovy @@ -237,11 +237,11 @@ def createKubernetesPodConfig(image, type, arch = "amd64") resources: requests: cpu: '2' - memory: 10Gi + memory: 20Gi ephemeral-storage: 25Gi limits: cpu: '2' - memory: 10Gi + memory: 20Gi ephemeral-storage: 25Gi imagePullPolicy: Always""" nodeLabelPrefix = "cpu" diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/trtllm_moe.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/trtllm_moe.py index cc67beae4d5..f4f884f2a89 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/trtllm_moe.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/trtllm_moe.py @@ -236,11 +236,11 @@ def trtllm_moe_fused( ) else: # For non-gated MLP with ReLU^2 - if act_fn == ActivationType.Relu2: - activation_type = ActivationType.Relu2 + if act_fn in [ActivationType.Relu2, ActivationType.Silu]: + activation_type = act_fn else: raise ValueError( - f"Unsupported activation '{ActivationType(act_fn).name}' for mlp. Use 'relu2'." + f"Unsupported activation '{ActivationType(act_fn).name}' for mlp. Use 'relu2' or 'silu'." ) mapping, enable_alltoall = _check_moe_alltoall(mapping_config, max_num_tokens) @@ -293,10 +293,10 @@ def trtllm_moe_fused_fake( def _validate_mlp_style_and_act_fn(is_gated_mlp: bool, act_fn: int) -> None: assert (is_gated_mlp and act_fn in [ActivationType.Silu, ActivationType.Swiglu]) or ( - not is_gated_mlp and act_fn == ActivationType.Relu2 + not is_gated_mlp and act_fn in [ActivationType.Relu2, ActivationType.Silu] ), ( f"Unsupported combination: is_gated_mlp='{is_gated_mlp}', act_fn='{act_fn}'. " - f"Supported combinations: gated mlp with silu or mlp with relu2." + f"Supported combinations: gated mlp with silu or mlp with relu2 or silu." ) @@ -340,7 +340,7 @@ def trtllm_quant_fp8_moe_fused( fc2_act_scale_reciprocal: FC2 activation scale reciprocal (scalar) fc2_dequant_scale: FC2 dequant scale [E] is_gated_mlp: True for gated_mlp, False for mlp - act_fn: ActivationType.Silu for gated_mlp, ActivationType.Relu2 for mlp + act_fn: ActivationType.Silu for gated_mlp, ActivationType.Relu2 or ActivationType.Silu for mlp Returns: Output tensor of shape (B, H) or (B, S, H) @@ -481,7 +481,7 @@ def trtllm_quant_nvfp4_moe_fused( fc1_alpha: FC1 dequant scales = 1.0 / (fc1_act_global_scale * fc1_weight_global_scale) fc2_alpha: FC2 dequant scales = 1.0 / (fc2_act_global_scale * fc2_weight_global_scale) mlp_style: "gated_mlp" or "mlp" - act_fn: "silu" for gated_mlp, "relu2" for mlp + act_fn: "silu" for gated_mlp, "relu2" or "silu" for mlp """ # Validate block scale tensors are 3D (padding requirements handled below) diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py index 8026a7799b4..af7f4aab50b 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py +++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py @@ -283,6 +283,8 @@ def _to_trtllm_gen_activation_type(self, return 0 elif activation_type == ActivationType.Relu2: return 1 + elif activation_type == ActivationType.Silu: + return 2 else: raise ValueError(f"Unsupported activation type: {activation_type}") @@ -340,8 +342,9 @@ def _get_quant_method(self): return DeepSeekFP8BlockScalesFusedMoEMethod() elif self.quant_config.layer_quant_mode.has_nvfp4(): return NVFP4TRTLLMGenFusedMoEMethod( - ) if self.swiglu_alpha is not None or self.activation_type == ActivationType.Relu2 else NVFP4TRTLLMGenFusedMoEBaseMethod( - ) + ) if self.swiglu_alpha is not None or self.activation_type in [ + ActivationType.Relu2, ActivationType.Silu + ] else NVFP4TRTLLMGenFusedMoEBaseMethod() elif self.quant_config.layer_quant_mode.has_w4a16_mxfp4(): return W4A16MXFP4TRTLLMGenFusedMoEMethod() elif self.quant_config.layer_quant_mode.has_w4a8_nvfp4_fp8(): @@ -570,7 +573,9 @@ def run_moe( topk_ids=token_selected_experts, ) elif self.has_nvfp4: - factor = 1 if self.activation_type == ActivationType.Relu2 else 2 + factor = 1 if self.activation_type in [ + ActivationType.Relu2, ActivationType.Silu + ] else 2 intermediate_size_per_partition_padded = self.w3_w1_weight.shape[ -2] // factor act_type = self._to_trtllm_gen_activation_type(self.activation_type) diff --git a/tensorrt_llm/_torch/modules/fused_moe/quantization.py b/tensorrt_llm/_torch/modules/fused_moe/quantization.py index 49c00f8c752..eb62a50016f 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/quantization.py +++ b/tensorrt_llm/_torch/modules/fused_moe/quantization.py @@ -2826,10 +2826,11 @@ def load_quant_scales(self, module: torch.nn.Module, weights: Dict): # last step: load fc31_scale_c # c_global_sf: fc2_input_scale # For gated activations (SwiGlu), scale_c_fc1 includes both input and weight scales - # For non-gated activations (Relu2), scale_c_fc1 is just the input scale + # For non-gated activations (Relu2 or Silu), scale_c_fc1 is just the input scale from ...utils import ActivationType - if hasattr(module, 'activation_type' - ) and module.activation_type == ActivationType.Relu2: + if hasattr(module, 'activation_type') and module.activation_type in [ + ActivationType.Relu2, ActivationType.Silu + ]: # For Relu2: scale_c_fc1 = fc2_input_scale (broadcast to all experts) module.fc31_scale_c.data.copy_(module.fc2_input_scale.data.expand( module.expert_size_per_partition), diff --git a/tensorrt_llm/_torch/utils.py b/tensorrt_llm/_torch/utils.py index 3c243346bb8..64387894d0b 100644 --- a/tensorrt_llm/_torch/utils.py +++ b/tensorrt_llm/_torch/utils.py @@ -54,6 +54,7 @@ class ActivationType(IntEnum): class ActType_TrtllmGen(IntEnum): SwiGlu = 0 Relu2 = 1 + Silu = 2 # IMPORTANT: when adding a new activation type, please update this function. diff --git a/tensorrt_llm/serve/openai_server.py b/tensorrt_llm/serve/openai_server.py index f84c0534914..0373bfeffdc 100644 --- a/tensorrt_llm/serve/openai_server.py +++ b/tensorrt_llm/serve/openai_server.py @@ -1477,7 +1477,8 @@ async def openai_video_generation_sync( actual_output_path = MediaStorage.save_video( video=output.video, - output_path=self.media_storage_path / f"{video_id}{resolved_ext}", + output_path=self.media_storage_path / + f"{video_id}{resolved_ext}", audio=output.audio, frame_rate=request.fps or params.frame_rate, format=resolved_fmt, @@ -1646,7 +1647,8 @@ async def _generate_video_background( actual_output_path = MediaStorage.save_video( video=output.video, - output_path=self.media_storage_path / f"{video_id}{resolved_ext}", + output_path=self.media_storage_path / + f"{video_id}{resolved_ext}", audio=output.audio, frame_rate=request.fps or params.frame_rate, format=resolved_fmt, diff --git a/tests/unittest/_torch/thop/serial/test_moe.py b/tests/unittest/_torch/thop/serial/test_moe.py index b28e695ecec..b783b32e82d 100644 --- a/tests/unittest/_torch/thop/serial/test_moe.py +++ b/tests/unittest/_torch/thop/serial/test_moe.py @@ -37,6 +37,7 @@ class ActType(Enum): SwiGlu = 0 Relu2 = 1 + Silu = 2 class moe_args: @@ -422,6 +423,8 @@ def run_moe_dequant(args, activation_output[i:i + my_num_tokens] = act * (beta + my_x1) elif args.act_type == ActType.Relu2: activation_output[i:i + my_num_tokens] = F.relu(my_x1)**2 + elif args.act_type == ActType.Silu: + activation_output[i:i + my_num_tokens] = F.silu(my_x1) i += my_num_tokens i = (i + args.padding - 1) // args.padding * args.padding @@ -1021,8 +1024,9 @@ class TestMoeFp4: @pytest.mark.parametrize("num_tokens", [1, 1024]) @pytest.mark.parametrize("hidden_size", [1024]) @pytest.mark.parametrize("intermediate_size", [1024, 768]) - @pytest.mark.parametrize("act_type", [ActType.SwiGlu, ActType.Relu2], - ids=["swiglu", "relu2"]) + @pytest.mark.parametrize("act_type", + [ActType.SwiGlu, ActType.Relu2, ActType.Silu], + ids=["swiglu", "relu2", "silu"]) @pytest.mark.parametrize( "routing_info", [ @@ -1137,8 +1141,9 @@ def test_autotune_fp8_fp4(self, num_tokens, hidden_size, intermediate_size, @pytest.mark.parametrize("num_tokens", [1, 150]) @pytest.mark.parametrize("hidden_size", [1024]) @pytest.mark.parametrize("intermediate_size", [1024]) - @pytest.mark.parametrize("act_type", [ActType.SwiGlu, ActType.Relu2], - ids=["swiglu", "relu2"]) + @pytest.mark.parametrize("act_type", + [ActType.SwiGlu, ActType.Relu2, ActType.Silu], + ids=["swiglu", "relu2", "silu"]) @pytest.mark.parametrize( "routing_info", [ @@ -1601,7 +1606,7 @@ def run_moe_fp4_test(self, scale_c_fc1 = args_dequant.c_global_sf * ( 1.0 / args.gemm1_scales_global) * ( 1.0 / args.hidden_states_scale_global) - elif act_type == ActType.Relu2: + elif act_type in [ActType.Relu2, ActType.Silu]: scale_c_fc1 = torch.full_like(args.gemm1_scales_global, args_dequant.c_global_sf) # self.fc31_alpha @@ -1651,7 +1656,7 @@ def run_moe_fp4_test(self, do_finalize=True, topk_ids=topk_ids, topk_weights=topk_weights, - act_type=1 if act_type == ActType.Relu2 else 0) + act_type=act_type.value) torch.cuda.synchronize() output_dequant_actual = output[0].to(torch.float) @@ -1662,7 +1667,7 @@ def run_moe_fp4_test(self, else: atol = 0.1 rtol = 0.85 - percent = 0.925 + percent = 0.9 check_accuracy(output_dequant_reference, output_dequant_actual,