
Commit 6c56ac8

PerkzZheng and claude committed
Sync trtllm FMHA: mSparseMla -> mSparseAttn for new cubin struct
Minimal header changes to match the new trtllm-gen FMHA cubin MetaInfo struct layout:

- TllmGenFmhaKernelMetaInfo: renamed mSparseMla (bool) -> mSparseAttn (int). Callers convert to bool via `!= 0`.
- KernelParams (GPU-side struct): renamed mSparseMlaTopK -> mSparseAttnTopK and moved it immediately after mSkipSoftmaxThresholdScaleFactor to match the layout expected by the new kernels.

The K/V dtype split (mDataTypeKv -> mDataTypeK/V) and the SageAttention block size fields present in the new struct are layout-compatible but not used, so no code changes are needed for those; existing references to mDataTypeKv still compile because the cubin-supplied struct keeps that field alongside the new mDataTypeK/V.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
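For illustration, a minimal sketch of what the rename looks like from a caller's side, assuming a stripped-down MetaInfo struct; everything except the mSparseAttn field name is illustrative and not the actual cubin-generated header:

#include <cstdint>

// Illustrative only; the real TllmGenFmhaKernelMetaInfo carries many more fields.
struct MetaInfoSketch {
  // Before this commit: bool mSparseMla;
  int32_t mSparseAttn;  // non-zero means the kernel handles sparse attention
};

// Call sites that used to read the bool directly now convert explicitly.
bool usesSparseAttn(MetaInfoSketch const& kernelMeta) {
  return kernelMeta.mSparseAttn != 0;
}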
1 parent ffa04bb commit 6c56ac8

3 files changed: 7 additions & 6 deletions


csrc/fmhaReduction.cu

Lines changed: 2 additions & 2 deletions
@@ -81,7 +81,7 @@ __global__ void __launch_bounds__(NumThreadsPerCta, 2)
   seqLenKv = seqLenKv - ((params.mMaxSeqLenQ - 1) - ctaIdxQ);
   // Consider sparseMlaTopK.
   if (sparseMla) {
-    seqLenKv = min(seqLenKv, params.mSparseMlaTopK);
+    seqLenKv = min(seqLenKv, params.mSparseAttnTopK);
   }
   // The actual number of CtasKv (TileSizeKv is always 128 for now).
   int32_t numCtasKv{min((seqLenKv + 127) / 128, params.mMaxNumCtasKv)};
@@ -361,7 +361,7 @@ void runFmhaReduction(TllmGenFmhaKernelMetaInfo const& kernelMeta, KernelParams
   }

   // Launch the kernel.
-  cudaLaunchKernelEx(&config, kernel, params, kernelMeta.mSparseMla, numCtasForReduction,
+  cudaLaunchKernelEx(&config, kernel, params, kernelMeta.mSparseAttn != 0, numCtasForReduction,
                      numCtasForAllHeads, numHeadDimCtasV);
   cudaError_t err = cudaGetLastError();
   FLASHINFER_CHECK(err == cudaSuccess, "Failed to launch kernel: ", cudaGetErrorString(err));
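For context, a small host-side sketch of the arithmetic this hunk touches: when sparse attention is enabled, only the top-K K/V tokens contribute, so the effective K/V length is clamped before the 128-wide CTA count is derived. The helper function and its name are illustrative, not part of the library:

#include <algorithm>
#include <cstdint>

// Illustrative helper mirroring the clamp + CTA-count math in the hunk above.
int32_t numCtasKvSketch(int32_t seqLenKv, bool sparseAttn, int32_t sparseAttnTopK,
                        int32_t maxNumCtasKv) {
  if (sparseAttn) {
    // Only the top-K K/V tokens are attended, so the effective K/V length is capped.
    seqLenKv = std::min(seqLenKv, sparseAttnTopK);
  }
  // TileSizeKv is always 128 for now: round up to whole K/V CTAs, then cap at the maximum.
  return std::min((seqLenKv + 127) / 128, maxNumCtasKv);
}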

include/flashinfer/trtllm/fmha/fmhaKernels.cuh

Lines changed: 1 addition & 1 deletion
@@ -191,7 +191,7 @@ class TllmGenFmhaKernel {
         kernelMeta.mTileScheduler, kernelMeta.mMultiCtasKvMode,
         kernelMeta.mHeadDimPerCtaV, kernelMeta.mHeadDimQk, kernelMeta.mHeadDimV,
         kernelMeta.mTileSizeQ, kernelMeta.mTileSizeKv, kernelMeta.mNumTokensPerPage,
-        kernelMeta.mReuseSmemKForV, kernelMeta.m2CtaMma, kernelMeta.mSparseMla,
+        kernelMeta.mReuseSmemKForV, kernelMeta.m2CtaMma, kernelMeta.mSparseAttn != 0,
         kernelMeta.mSkipsSoftmaxWhenPossible);
   }

include/flashinfer/trtllm/fmha/kernelParams.h

Lines changed: 4 additions & 3 deletions
@@ -194,13 +194,14 @@ struct KernelParams {
   float mScaleSfO;
   // Threshold to decide whether warp skips softmax ops
   float mSkipSoftmaxThresholdScaleFactor;
+  // The sparse attention topK value. Must immediately follow mSkipSoftmaxThresholdScaleFactor
+  // to match the GPU struct layout expected by trtllm-gen kernels.
+  int32_t mSparseAttnTopK;
   // The start token index in SF tensor. Used for FP4 SF offset calculation in generation phase
   // kernel when inflight batching is enabled in TRT-LLM.
   int32_t mStartTokenIdxSfO;
   // The sum of sequence lengths for Q and K/V.
   int32_t mSumOfSeqLensQ, mSumOfSeqLensKv;
-  // The sparseMla topK value.
-  int32_t mSparseMlaTopK;
   // The flag to use block sparse attention.
   bool mUseBlockSparseAttention;
   // Whether the indices for K & V pages are shared as unified index.
@@ -879,7 +880,7 @@ struct KernelParams {
   // indices.
   FLASHINFER_CHECK(!options.mSparseMla || (options.mSparseMlaTopK % 4) == 0,
                    "SparseMlaTopK must be a multiple of 4");
-  params.mSparseMlaTopK = options.mSparseMlaTopK;
+  params.mSparseAttnTopK = options.mSparseMlaTopK;
   // TODO: Integrate trtllm block-sparse attention kernels when needed.
   params.mUseBlockSparseAttention = false;
   // Whether the indices for K & V pages are shared as unified index (vLLM/FlashInfer).
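The "must immediately follow" comment above is a layout contract between this host-side struct and the trtllm-gen cubins. A minimal, self-contained sketch of how such a constraint could be asserted at compile time; the struct below is a stand-in, not the actual KernelParams definition:

#include <cstddef>
#include <cstdint>

// Stand-in struct with only the fields relevant to the layout contract.
struct KernelParamsSketch {
  float mSkipSoftmaxThresholdScaleFactor;
  int32_t mSparseAttnTopK;  // must sit directly after the threshold field
  int32_t mStartTokenIdxSfO;
};

static_assert(offsetof(KernelParamsSketch, mSparseAttnTopK) ==
                  offsetof(KernelParamsSketch, mSkipSoftmaxThresholdScaleFactor) + sizeof(float),
              "mSparseAttnTopK must immediately follow mSkipSoftmaxThresholdScaleFactor");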

0 commit comments