
Commit b809821

PerkzZheng and claude authored
[Fmha] update trtllm-gen FMHA cubins and sync headers for context SWA fix (#3089)
## 📌 Description

The branch has 2 commits:

1. Update the trtllm-gen FMHA cubins to fix the context SWA page-skip bug (updates the artifacts.py path and checksum).
2. Sync the trtllm FMHA headers with the latest trtllm-gen (from PR #2711), cherry-picking the header changes to match the new cubin MetaInfo struct.

## 🚀 Pull Request Checklist

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).

## Summary by CodeRabbit

* **Bug Fixes**
  * Fixed sparse-attention truncation so the sequence-length top-K is applied correctly when sparse attention is enabled.
* **Improvements**
  * Standardized sparse-attention parameter naming and selection logic to make behavior more consistent across launches and kernel choices.
  * Skip incompatible kernel variants during runtime kernel loading to avoid incorrect selections.
* **Chores**
  * Updated FMHA runtime artifact paths and their checksums for validation and downloads.

---------

Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com>
1 parent (c9eb3cd) · commit b809821

2 files changed: 19 additions & 8 deletions

### flashinfer/artifacts.py

2 additions & 2 deletions
```diff
@@ -135,7 +135,7 @@ class ArtifactPath:
     When compiling new cubins for backend directories, update the corresponding path.
     """

-    TRTLLM_GEN_FMHA: str = "82f4c77d9cf83e3fcf105feda4ce3445100ab491/fmha/trtllm-gen/"
+    TRTLLM_GEN_FMHA: str = "134850621dbbd55ed6b0c3fa7c29b968136c05ef/fmha/trtllm-gen/"
     TRTLLM_GEN_BMM: str = (
         "39a9d28268f43475a757d5700af135e1e58c9849/batched_gemm-5ee61af-2b9855b/"
     )
@@ -155,7 +155,7 @@ class CheckSumHash:
     """

     TRTLLM_GEN_FMHA: str = (
-        "56c95fbe5d1b5d0d9ded7706e1c0b7ebf0582d9cfd2f9382acd878b6b9d58c89"
+        "2be32ce1949ab0b1e637c27f128b77c41d6753a36cb9c0e1a97acb2d3d44ae5f"
     )
     TRTLLM_GEN_BMM: str = (
         "db06db7f36a2a9395a2041ff6ac016fe664874074413a2ed90797f91ef17e0f6"
```
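The pinned value in `CheckSumHash` is a 64-hex-digit digest (SHA-256, judging by its length) that the downloader compares against the fetched cubin bundle. Below is a minimal C++ sketch of that verification step, assuming OpenSSL's EVP API; `verifyChecksum` and the streaming loop are illustrative stand-ins, not FlashInfer's actual download path (which is Python code in artifacts.py):

```cpp
// Sketch: verify a downloaded artifact against a pinned SHA-256 checksum.
// Assumes OpenSSL (link with -lcrypto); verifyChecksum is a hypothetical helper.
#include <openssl/evp.h>

#include <cstdio>
#include <fstream>
#include <string>

bool verifyChecksum(const std::string& path, const std::string& expectedHex) {
  std::ifstream file(path, std::ios::binary);
  if (!file) return false;

  EVP_MD_CTX* ctx = EVP_MD_CTX_new();
  EVP_DigestInit_ex(ctx, EVP_sha256(), nullptr);

  // Stream the file through the digest in 64 KiB chunks.
  char buf[1 << 16];
  while (file.read(buf, sizeof(buf)) || file.gcount() > 0) {
    EVP_DigestUpdate(ctx, buf, static_cast<size_t>(file.gcount()));
  }

  unsigned char digest[EVP_MAX_MD_SIZE];
  unsigned int len = 0;
  EVP_DigestFinal_ex(ctx, digest, &len);
  EVP_MD_CTX_free(ctx);

  // Hex-encode the digest and compare with the pinned value.
  char hex[2 * EVP_MAX_MD_SIZE + 1];
  for (unsigned int i = 0; i < len; ++i) std::snprintf(hex + 2 * i, 3, "%02x", digest[i]);
  return expectedHex == std::string(hex, 2 * len);
}
```

Pinning both the path (a content-addressed directory prefix) and the digest means a stale or tampered artifact fails loudly at download time rather than misbehaving at kernel launch.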

### include/flashinfer/trtllm/fmha/fmhaKernels.cuh

17 additions & 6 deletions
```diff
@@ -20,6 +20,7 @@

 #include <cfloat>
 #include <cstdint>
+#include <cstring>
 #include <cuda/std/cfloat>
 #include <iterator>
 #include <memory>
```
```diff
@@ -112,6 +113,13 @@ class TllmGenFmhaKernel {
     for (unsigned int i = 0; i < mKernelMetaCount; ++i) {
       auto const& kernelMeta = mKernelMeta[i];
       IKL_LOG_DEBUG("Checking tllmgen attention kernel %s", kernelMeta.mFuncName);
+      // Skip SageAttention kernels: they share the same hashID as their non-sage
+      // counterparts (sage block sizes are not part of the hash), which causes
+      // false "hash conflict" failures. SageAttention is not exposed through the
+      // flashinfer interface, so dropping these entries is safe.
+      if (kernelMeta.mFuncName != nullptr && std::strstr(kernelMeta.mFuncName, "Sage") != nullptr) {
+        continue;
+      }
       if (isSMCompatible(mSM, kernelMeta.mSM) && kernelMeta.mDataTypeQ == mDtypeQ &&
           kernelMeta.mDataTypeKv == mDtypeKv && kernelMeta.mDataTypeO == mDtypeOut) {
         // Store metadata for later use.
```
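The comment in this hunk describes a classic key-collision problem: the hashID omits the sage block sizes, so a Sage kernel and its non-Sage sibling key to the same map entry, and inserting both looks like a conflict. A small sketch of that failure mode and the name-based skip follows; the struct fields (`mHeadDim`, `mSageBlockSize`) are hypothetical stand-ins for the real MetaInfo layout:

```cpp
#include <cstdint>
#include <cstring>
#include <unordered_map>

struct KernelMeta {
  const char* mFuncName;
  int mHeadDim;        // hypothetical; stands in for the fields that ARE hashed
  int mSageBlockSize;  // hypothetical; 0 for non-Sage kernels, NOT hashed
};

// The hash key ignores mSageBlockSize, so a Sage kernel and its non-Sage
// sibling with identical hashed fields collapse to the same hashID.
uint64_t hashId(const KernelMeta& m) { return static_cast<uint64_t>(m.mHeadDim); }

void buildMetaMap(const KernelMeta* metas, unsigned count,
                  std::unordered_map<uint64_t, unsigned>& metaMap) {
  for (unsigned i = 0; i < count; ++i) {
    // Skip Sage variants by name, as the diff does; otherwise the emplace
    // below fails (a false "hash conflict") for every Sage/non-Sage pair.
    if (metas[i].mFuncName != nullptr && std::strstr(metas[i].mFuncName, "Sage") != nullptr) {
      continue;
    }
    bool inserted = metaMap.emplace(hashId(metas[i]), i).second;
    (void)inserted;  // with the skip in place, false here signals a genuine conflict
  }
}
```

Filtering by function name is a pragmatic fix while SageAttention stays outside the flashinfer interface; the cleaner long-term fix would fold the sage block sizes into the hash itself.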
```diff
@@ -443,13 +451,15 @@ class TllmGenFmhaKernel {

     // Enable the CgaSmemReduction if the numCtasPerSeqKv <= 16 as the maximum cluster dimension
     // is 16. Only the swapsMmaAbForGeneration kernel supports the CgaSmemReduction for now.
-    // CgaSmemReduction exceeds the shared memory limit for MLA decode with tileSizeQ >= 32
-    // (headDimQk=576 requires more smem than the device allows for that tile size).
+    // headDimV >= 512 is excluded: the current trtllm-gen cubin ships no SwapsMmaAb
+    // CgaSmemReduction kernels at headDimV >= 512 (covers both MLA headDimQk=576/V=512 and
+    // non-MLA H=512), and for tileSizeQ >= 32 the CGA variant also exceeds the device smem
+    // limit. This guard can be narrowed once trtllm-gen ships a cubin with the
+    // tileSizeQ>=32 + headDimPerCtaV>=512 skip predicate.
     if (!isDsv3MinLatencyMode && numCtasPerSeqKv > 1 && numCtasPerSeqKv <= 16 &&
         isSwapsMmaAbForGenerationKernel(selectKernelParams.mKernelType) &&
         isGmemReduction(selectKernelParams.mMultiCtasKvMode) &&
-        !selectKernelParams.mForceGmemReduction &&
-        (!isMlaGenKernel(params) || selectKernelParams.mTileSizeQ < 32)) {
+        !selectKernelParams.mForceGmemReduction && params.mHeadDimV < 512) {
       selectKernelParams.mMultiCtasKvMode = MultiCtasKvMode::CgaSmemReduction;
       // Need to select a different kernel.
       selectKernelParams.mSelectNewKernel = true;
```
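Read in isolation, the new gate is a pure predicate over the runner and selection params: the change replaces the MLA/tileSizeQ special case with a single `headDimV < 512` bound. A sketch under assumed, pared-down types; the `isSwapsMmaAbForGenerationKernel` kernel-type check is elided, and only the fields the condition reads are kept:

```cpp
// Sketch of the CgaSmemReduction gate as a standalone predicate.
// Types are hypothetical reductions of the real params structs.
enum class MultiCtasKvMode { GmemReduction, CgaSmemReduction };

struct RunnerParams {
  int mHeadDimV;  // head dimension of V; the only runner field the gate reads
};

struct SelectKernelParams {
  MultiCtasKvMode mMultiCtasKvMode;
  bool mForceGmemReduction;
};

// Assumes the kernel type is already known to be a SwapsMmaAb generation kernel.
bool shouldUseCgaSmemReduction(const RunnerParams& params, const SelectKernelParams& sel,
                               int numCtasPerSeqKv, bool isDsv3MinLatencyMode) {
  return !isDsv3MinLatencyMode &&
         numCtasPerSeqKv > 1 && numCtasPerSeqKv <= 16 &&  // max CGA cluster dim is 16
         sel.mMultiCtasKvMode == MultiCtasKvMode::GmemReduction &&
         !sel.mForceGmemReduction &&
         params.mHeadDimV < 512;  // cubin ships no SwapsMmaAb CGA kernels at headDimV >= 512
}
```

Gating on `mHeadDimV` rather than `isMlaGenKernel` catches both the MLA case (headDimQk=576/V=512) and the non-MLA H=512 case with one condition, which is why the old two-part clause could be dropped.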
```diff
@@ -864,12 +874,13 @@ class TllmGenFmhaKernel {
     // Hash the runner params.
     auto [hashId, info] = hashFromRunnerParams(params, selectKernelParams);
     auto const findMetaIter = mKernelMetaMap.find(hashId);
-    // The meta index.
-    auto const metaIndex = findMetaIter->second;

     // Add debug info when kernels are not found.
     FLASHINFER_CHECK(findMetaIter != mKernelMetaMap.end(), "Trtllm-gen kernels not found: " + info);

+    // The meta index.
+    auto const metaIndex = findMetaIter->second;
+
     // Load the function if not found.
     if (mFunctions.find(hashId) == mFunctions.end()) {
       // Load the kernel on-demand.
```
