Skip to content

Commit 6601758

Browse files
authored
[https://nvbugs/5983390][perf] Kernel fusions in _gather_k_cache_for_chunk of Indexer in DSA (NVIDIA#12322)
Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com>
1 parent 45c1d93 commit 6601758

File tree

3 files changed

+315
-84
lines changed

3 files changed

+315
-84
lines changed

tensorrt_llm/_torch/attention_backend/sparse/dsa.py

Lines changed: 18 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@
3232
from tensorrt_llm.mapping import Mapping
3333
from tensorrt_llm.models.modeling_utils import QuantConfig
3434

35-
from .kernel import triton_convert_req_index_to_global_index
35+
from .kernel import (triton_convert_req_index_to_global_index,
36+
triton_gather_k_cache)
3637

3738
ModelConfig = tensorrt_llm.bindings.ModelConfig
3839

@@ -94,24 +95,6 @@ def _compute_slot_mappings(
9495
return fp8_indices, scale_indices
9596

9697

97-
def _unravel_indices(flat_indices: torch.Tensor,
98-
shape: Tuple[int, ...]) -> Tuple[torch.Tensor, ...]:
99-
"""
100-
Unravel indices into multiple dimensions.
101-
"""
102-
d3 = shape[3]
103-
i3 = flat_indices % d3
104-
flat_indices = flat_indices // d3
105-
d2 = shape[2]
106-
i2 = flat_indices % d2
107-
flat_indices = flat_indices // d2
108-
d1 = shape[1]
109-
i1 = flat_indices % d1
110-
flat_indices = flat_indices // d1
111-
i0 = flat_indices
112-
return i0, i1, i2, i3
113-
114-
11598
def rotate_activation(x: torch.Tensor) -> torch.Tensor:
11699
assert x.dtype == torch.bfloat16
117100

@@ -1402,68 +1385,6 @@ def _update_k_cache(self, k_fp8: torch.Tensor, k_scale: torch.Tensor,
14021385
k_cache, flat_indices_fp8,
14031386
flat_indices_scale)
14041387

1405-
def _gather_k_cache_for_chunk(
    self,
    metadata: DSAtrtllmAttentionMetadata,
    chunk: IndexerPrefillChunkMetadata,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Gather K values from indexer cache for a specific chunk.

    Uses pre-computed extended slot mappings that cover cached + current batch
    context tokens. chunk.k_token_start/k_token_end directly index into the
    extended slot mapping.

    The cache pool is addressed in raw bytes: each slot-mapping entry is a
    flat byte offset into the (uint8) cache buffer, so FP8 payload and
    float32 scales are gathered as bytes and reinterpreted via dtype views.

    Args:
        metadata: Attention metadata
        chunk: Chunk metadata with k_token_start/end as indices into extended slot mapping

    Returns:
        k_fp8: FP8 quantized k tensor, shape [num_k_tokens, head_dim]
        k_scale: Scaling factors, shape [num_k_tokens, 1]
    """
    assert metadata.slot_mapping_fp8_fullkv is not None, \
        "_gather_k_cache_for_chunk requires extended slot mappings (only available with cached tokens)"

    # Per-layer indexer K cache buffer (byte-addressable pool).
    k_cache = metadata.kv_cache_manager.get_indexer_k_cache_buffers(
        self.layer_idx)

    head_dim = self.head_dim
    scale_size = 4  # float32 = 4 bytes

    # Extract slot mappings using chunk's k_token_start/end
    # These indices point directly into the extended slot mapping array
    k_token_start = chunk.k_token_start
    k_token_end = chunk.k_token_end
    num_k_tokens = k_token_end - k_token_start

    slot_mapping_fp8_chunk = metadata.slot_mapping_fp8_fullkv[
        k_token_start:k_token_end]
    slot_mapping_scale_chunk = metadata.slot_mapping_scale_fullkv[
        k_token_start:k_token_end]

    # Vectorized gather using pre-computed slot mappings
    # Gather FP8 data: each token's base byte offset is expanded into
    # head_dim consecutive byte addresses, then unraveled into the
    # cache's multi-dimensional index space for advanced indexing.
    byte_offsets_fp8 = torch.arange(
        head_dim, device=k_cache.device).unsqueeze(0)  # [1, head_dim]
    gather_indices_fp8 = slot_mapping_fp8_chunk.unsqueeze(
        1) + byte_offsets_fp8  # [num_k_tokens, head_dim]
    gather_indices_fp8 = _unravel_indices(gather_indices_fp8, k_cache.shape)
    k_fp8_bytes = k_cache[gather_indices_fp8]
    # Reinterpret the gathered bytes as FP8 values (1 byte per element).
    k_fp8 = k_fp8_bytes.view(torch.float8_e4m3fn).view(
        num_k_tokens, head_dim)

    # Gather scale data: 4 consecutive bytes per token, reinterpreted
    # as one float32 scale each.
    byte_offsets_scale = torch.arange(
        scale_size, device=k_cache.device).unsqueeze(0)  # [1, 4]
    gather_indices_scale = slot_mapping_scale_chunk.unsqueeze(
        1) + byte_offsets_scale  # [num_k_tokens, 4]
    gather_indices_scale = _unravel_indices(gather_indices_scale,
                                            k_cache.shape)
    k_scale_bytes = k_cache[gather_indices_scale]
    k_scale = k_scale_bytes.view(torch.float32).view(num_k_tokens, 1)

    return k_fp8, k_scale
1466-
14671388
def sparse_attn_indexer(
14681389
self,
14691390
metadata: DSAtrtllmAttentionMetadata,
@@ -1502,10 +1423,23 @@ def sparse_attn_indexer(
15021423
tp_rank = metadata.mapping.tp_rank
15031424
tp_size = metadata.mapping.tp_size
15041425

1426+
# Use the 2D pool data directly (contiguous) instead of the
1427+
# 4D view, because the 4D view may have strides that
1428+
# prevent flattening via .view(-1).
1429+
layer_offset = metadata.kv_cache_manager.layer_offsets[
1430+
self.layer_idx]
1431+
gather_k_cache_pool = metadata.kv_cache_manager.indexer_k_cache_pool_per_layer[
1432+
layer_offset]
1433+
15051434
for chunk in metadata.indexer_prefill_chunks:
1506-
# Gather K from cache for this chunk (dual to _update_k_cache)
1507-
chunk_k_fp8, chunk_k_scale = self._gather_k_cache_for_chunk(
1508-
metadata, chunk)
1435+
chunk_k_fp8, chunk_k_scale = triton_gather_k_cache(
1436+
gather_k_cache_pool,
1437+
metadata.slot_mapping_fp8_fullkv,
1438+
metadata.slot_mapping_scale_fullkv,
1439+
chunk.k_token_start,
1440+
chunk.k_token_end,
1441+
self.head_dim,
1442+
)
15091443

15101444
chunk_num_token = chunk.token_end - chunk.token_start
15111445
apply_q_split = q_split_eligible and chunk_num_token >= q_split_threshold

tensorrt_llm/_torch/attention_backend/sparse/kernel.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1936,3 +1936,121 @@ def triton_convert_req_index_to_global_index(
19361936
out_stride1,
19371937
)
19381938
return out
1939+
1940+
1941+
########################################################
1942+
# Fused K cache gather kernel
1943+
########################################################
1944+
1945+
1946+
@triton.jit
def _triton_gather_k_cache_kernel(
    k_cache_ptr,
    slot_fp8_ptr,
    slot_scale_ptr,
    out_fp8_ptr,
    out_scale_ptr,
    k_token_start,
    num_k_tokens,
    HEAD_DIM: tl.constexpr,
    SCALE_BYTES: tl.constexpr,
    BLOCK_TOKENS: tl.constexpr,
):
    # Gather FP8 K bytes and scale bytes from a flat (byte-addressed)
    # cache into dense per-chunk output buffers. One program handles
    # BLOCK_TOKENS tokens; grid size = cdiv(num_k_tokens, BLOCK_TOKENS).
    #
    # k_cache_ptr:    flat uint8 cache pool.
    # slot_fp8_ptr:   per-token base byte offset of the FP8 payload.
    # slot_scale_ptr: per-token base byte offset of the float32 scale.
    # out_fp8_ptr:    dense output, num_k_tokens * HEAD_DIM bytes.
    # out_scale_ptr:  dense output, num_k_tokens * SCALE_BYTES bytes.
    pid = tl.program_id(0)
    # int64 to keep byte-offset arithmetic safe for large pools.
    token_offsets = (pid * BLOCK_TOKENS + tl.arange(0, BLOCK_TOKENS)).to(
        tl.int64)
    token_mask = token_offsets < num_k_tokens

    # Slot mappings are indexed from k_token_start (chunk-relative view
    # into the full extended mapping).
    fp8_base = tl.load(slot_fp8_ptr + k_token_start + token_offsets,
                       mask=token_mask,
                       other=0)
    scale_base = tl.load(slot_scale_ptr + k_token_start + token_offsets,
                         mask=token_mask,
                         other=0)

    # FP8 payload: HEAD_DIM consecutive bytes per token, scattered source
    # addresses -> contiguous destination rows.
    byte_offsets = tl.arange(0, HEAD_DIM).to(tl.int64)
    src_fp8 = fp8_base[:, None] + byte_offsets[None, :]
    dst_fp8 = token_offsets[:, None] * HEAD_DIM + byte_offsets[None, :]
    gather_mask = token_mask[:, None]

    fp8_data = tl.load(k_cache_ptr + src_fp8, mask=gather_mask, other=0)
    tl.store(out_fp8_ptr + dst_fp8, fp8_data, mask=gather_mask)

    # Scales: SCALE_BYTES consecutive bytes per token (raw float32 bytes;
    # reinterpretation happens on the host side via dtype views).
    scale_byte_offsets = tl.arange(0, SCALE_BYTES).to(tl.int64)
    src_scale = scale_base[:, None] + scale_byte_offsets[None, :]
    dst_scale = token_offsets[:,
                              None] * SCALE_BYTES + scale_byte_offsets[None, :]

    scale_data = tl.load(k_cache_ptr + src_scale, mask=gather_mask, other=0)
    tl.store(out_scale_ptr + dst_scale, scale_data, mask=gather_mask)
1986+
1987+
1988+
def triton_gather_k_cache(
    k_cache: torch.Tensor,
    slot_mapping_fp8: torch.Tensor,
    slot_mapping_scale: torch.Tensor,
    k_token_start: int,
    k_token_end: int,
    head_dim: int,
):
    """Gather K FP8 values and scales from the indexer K cache for a chunk.

    Single fused Triton launch replacing ``_gather_k_cache_for_chunk``,
    which issued ~8-12 small PyTorch ops (arange, unsqueeze, broadcast add,
    _unravel_indices, advanced indexing) per chunk. Pure data movement —
    bit-exact with the original.

    Args:
        k_cache: Indexer K cache pool data (2D contiguous), uint8.
        slot_mapping_fp8: Flat byte indices for FP8 data
            ``[total_kv_len]``, int64.
        slot_mapping_scale: Flat byte indices for scale data
            ``[total_kv_len]``, int64.
        k_token_start: Start index into slot mapping arrays.
        k_token_end: End index into slot mapping arrays.
        head_dim: FP8 head dimension (typically 128).

    Returns:
        Tuple of (k_fp8, k_scale):
            k_fp8: ``[num_k_tokens, head_dim]``, float8_e4m3fn.
            k_scale: ``[num_k_tokens, 1]``, float32.
    """
    num_k_tokens = k_token_end - k_token_start
    device = k_cache.device

    # Empty chunk: nothing to launch, return correctly-typed empties.
    if num_k_tokens == 0:
        empty_fp8 = torch.empty(0,
                                head_dim,
                                dtype=torch.float8_e4m3fn,
                                device=device)
        empty_scale = torch.empty(0, 1, dtype=torch.float32, device=device)
        return empty_fp8, empty_scale

    SCALE_BYTES = 4  # sizeof(float32)
    BLOCK_TOKENS = 32

    # Raw byte buffers; the kernel writes bytes, dtype views reinterpret
    # them below without a copy.
    out_fp8 = torch.empty((num_k_tokens, head_dim),
                          dtype=torch.uint8,
                          device=device)
    out_scale = torch.empty((num_k_tokens, SCALE_BYTES),
                            dtype=torch.uint8,
                            device=device)

    num_blocks = triton.cdiv(num_k_tokens, BLOCK_TOKENS)
    _triton_gather_k_cache_kernel[(num_blocks, )](
        k_cache.reshape(-1),
        slot_mapping_fp8,
        slot_mapping_scale,
        out_fp8.view(-1),
        out_scale.view(-1),
        k_token_start,
        num_k_tokens,
        HEAD_DIM=head_dim,
        SCALE_BYTES=SCALE_BYTES,
        BLOCK_TOKENS=BLOCK_TOKENS,
    )

    k_fp8 = out_fp8.view(torch.float8_e4m3fn)
    k_scale = out_scale.view(torch.float32).view(num_k_tokens, 1)
    return k_fp8, k_scale

0 commit comments

Comments (0)