sgl-project
diff --git a/docs_new/docs/advanced_features/server_arguments.mdx b/docs_new/docs/advanced_features/server_arguments.mdx
Lines changed: 6 additions & 0 deletions
diff --git a/python/sglang/srt/layers/attention/linear/gdn_backend.py b/python/sglang/srt/layers/attention/linear/gdn_backend.py
Lines changed: 34 additions & 4 deletions
diff --git a/python/sglang/srt/layers/attention/mamba/mamba_state_scatter_triton.py b/python/sglang/srt/layers/attention/mamba/mamba_state_scatter_triton.py
Lines changed: 7 additions & 3 deletions
diff --git a/python/sglang/srt/layers/attention/triton_backend.py b/python/sglang/srt/layers/attention/triton_backend.py
Lines changed: 9 additions & 0 deletions
@@ -509,6 +509,12 @@ Please consult the documentation below and [server_args.py](https://github.com/s
       <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>The number of tokens in a page.</td>
       <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>`1`</td>
       <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Type: int</td>
+    </tr>
+    <tr>
+      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>`--enable-page-major-kv-layout`</td>
+      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Use the page-major KV layout: the Mamba state and full/SWA KV caches are stored in a page-granularity envelope (the page is the outermost axis, layer-major within a page) instead of the default per-layer (layer-major) layout. Requires the Triton attention, linear-attention, and Mamba backends (`--attention-backend triton`, and for hybrid models also `--linear-attn-backend triton --mamba-backend triton`).</td>
+      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>`False`</td>
+      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>bool flag (set to enable)</td>
     </tr>
         <tr>
       <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>`--swa-full-tokens-ratio`</td>
 
@@ -425,6 +425,30 @@ def forward_extend(
         else:
             has_initial_states = forward_batch.extend_prefix_lens > 0
 
+        # Page-major envelope: the prefill kernels (CUDA causal_conv1d_fwd,
+        # chunk_gated_delta_rule) write state back in place assuming a contiguous
+        # slot layout, so they silently drop the write to the strided envelope
+        # pool. Run them on contiguous per-sequence copies (identity-indexed) and
+        # scatter the result back. The gather is skipped for target-verify and
+        # whenever both state pools are already contiguous (the default pool).
+        # TODO(ch-wan): drop these .contiguous() copies by making the prefill conv
+        # and chunk_gated_delta_rule kernels honor the pool's real slot stride +
+        # int64 indexing, like packed_decode / causal_conv1d_update already do.
+        gather_mamba_state = (not is_target_verify) and (
+            not conv_states.is_contiguous() or not ssm_states.is_contiguous()
+        )
+        if gather_mamba_state:
+            conv_states_run = conv_states[cache_indices].contiguous()
+            ssm_states_run = ssm_states[cache_indices].contiguous()
+            state_cache_indices = torch.arange(
+                cache_indices.shape[0],
+                device=cache_indices.device,
+                dtype=cache_indices.dtype,
+            )
+        else:
+            conv_states_run = conv_states
+            ssm_states_run = ssm_states
+            state_cache_indices = cache_indices
+
         if is_target_verify:
             batch_size = seq_len // forward_batch.spec_info.draft_token_num
             draft_token_num = forward_batch.spec_info.draft_token_num
@@ -460,9 +484,9 @@ def forward_extend(
                 layer.conv_weights,
                 layer.bias,
                 activation=layer.activation,
-                conv_states=conv_states,
+                conv_states=conv_states_run,
                 has_initial_state=has_initial_states,
-                cache_indices=cache_indices,
+                cache_indices=state_cache_indices,
                 query_start_loc=query_start_loc,
                 seq_lens_cpu=forward_batch.extend_seq_lens_cpu,
             ).transpose(0, 1)[:seq_len]
@@ -514,8 +538,8 @@ def forward_extend(
                 v=value,
                 g=g,
                 beta=beta,
-                ssm_states=ssm_states,
-                cache_indices=cache_indices,
+                ssm_states=ssm_states_run,
+                cache_indices=state_cache_indices,
                 query_start_loc=query_start_loc,
             )
 
@@ -525,6 +549,12 @@ def forward_extend(
                 )
                 ssm_states[cache_indices] = last_recurrent_state
 
+            if gather_mamba_state:
+                # Scatter the in-place-updated contiguous copies back to the
+                # strided envelope pool (advanced indexing handles the strides).
+                conv_states[cache_indices] = conv_states_run
+                ssm_states[cache_indices] = ssm_states_run
+
             if h is not None:
                 self._track_mamba_state_extend(
                     forward_batch, h, ssm_states, forward_metadata
 
@@ -43,9 +43,13 @@ def track_mamba_state_if_needed_kernel(
     if not track_mask:
         return
 
-    # Load source and destination indices
-    src_idx = tl.load(cache_indices_ptr + batch_idx)
-    dst_idx = tl.load(mamba_track_indices_ptr + batch_idx)
+    # Cast indices to int64 before they multiply the row stride. The
+    # page-granularity envelope layout makes the conv/ssm row stride large
+    # (stride_0 = entry_bytes / itemsize), so an int32 `idx * stride_0` can
+    # overflow for moderately large idx and wrap to an illegal address. int64 is
+    # harmless for the small-stride (per-layer) case.
+    src_idx = tl.load(cache_indices_ptr + batch_idx).to(tl.int64)
+    dst_idx = tl.load(mamba_track_indices_ptr + batch_idx).to(tl.int64)
 
     # Copy conv_states
     # Each thread handles BLOCK_SIZE elements
 
@@ -147,6 +147,12 @@ def __init__(
         self.req_to_token = model_runner.req_to_token_pool.req_to_token
         self.token_to_kv_pool_allocator = model_runner.token_to_kv_pool_allocator
         self.use_sliding_window_kv_pool = isinstance(self.token_to_kv_pool, SWAKVPool)
+        # Pass-through to the Triton attention wrappers so they can extract the
+        # KV view strides and specialize on the PAGE_SIZE constexpr. At
+        # page_size=1 the kernel path matches the slot-based envelope addresses.
+        # `model_runner.page_size` is expected to already default to 1 when
+        # `server_args.page_size` is None; the `getattr(...) or 1` fallback also
+        # covers a runner without the attribute or with a falsy (None/0) value.
+        self.page_size = getattr(model_runner, "page_size", 1) or 1
         self.num_draft_tokens = model_runner.server_args.speculative_num_draft_tokens
         self.speculative_num_steps = model_runner.server_args.speculative_num_steps
         self.topk = model_runner.server_args.speculative_eagle_topk or 0
@@ -1306,6 +1312,7 @@ def forward_extend(
             sinks=sinks,
             window_kv_offsets=window_kv_offsets,
             xai_temperature_len=layer.xai_temperature_len,
+            page_size=self.page_size,
         )
         return o
 
@@ -1575,6 +1582,7 @@ def _forward_extend_unified(
             sinks=sinks,
             window_start_pos=window_start_pos,
             xai_temperature_len=layer.xai_temperature_len,
+            page_size=self.page_size,
         )
 
         return o
@@ -1710,6 +1718,7 @@ def forward_decode(
             xai_temperature_len=layer.xai_temperature_len,
             has_mla=self.use_mla,
             use_pdl=self.use_pdl,
+            page_size=self.page_size,
         )
         return o