Commit bf8c8ad

refactor(scheduler): delay caching instead of undoing (#525)
Problem: The scheduler speculatively cached blocks during `allocate_slots`, then had to undo the caching (via `undo_uncomputed_block_caching`) in three places: `spec_decode_cap` trimming, prefill over-allocation for chunked prefill, and prefills preempting running decodes. This was error-prone and coupled the scheduler to `KVCacheManager` internals (`block_pool`, `num_cached_block`).

Solution: Pass `delay_cache_blocks=True` to all `allocate_slots` calls so that no blocks are cached during allocation. A single finalization loop after all scheduling decisions calls `cache_blocks` and `schedule_sub_block_indexing` for each actually-scheduled request. This eliminates `undo_uncomputed_block_caching`.
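The before/after control flow described above can be sketched with toy stand-ins. The `Block` and `ToyKVCacheManager` classes below are illustrative only, not the actual vLLM-RBLN types; only the "allocate without caching, then cache exactly the computed blocks at finalization" pattern mirrors the commit.

```python
# Toy sketch of "delay caching, finalize once" vs. "cache eagerly, undo later".
# Block and ToyKVCacheManager are hypothetical stand-ins for illustration.

class Block:
    def __init__(self):
        self.block_hash = None  # None => not present in the prefix cache

class ToyKVCacheManager:
    def __init__(self):
        self.blocks = {}  # request_id -> list[Block]

    def allocate_slots(self, req_id, num_blocks, delay_cache_blocks=False):
        blocks = [Block() for _ in range(num_blocks)]
        self.blocks[req_id] = blocks
        if not delay_cache_blocks:
            # Eager path: speculatively cache everything now (may need undo).
            for i, b in enumerate(blocks):
                b.block_hash = (req_id, i)
        return blocks

    def cache_blocks(self, req_id, num_computed_blocks):
        # Finalization: cache only blocks whose KV data was actually computed.
        for i, b in enumerate(self.blocks[req_id][:num_computed_blocks]):
            b.block_hash = (req_id, i)

mgr = ToyKVCacheManager()
# Pre-allocate 4 blocks, but this step only computes 1 of them.
blocks = mgr.allocate_slots("req-0", 4, delay_cache_blocks=True)
assert all(b.block_hash is None for b in blocks)  # nothing cached yet

mgr.cache_blocks("req-0", num_computed_blocks=1)  # after scheduling finalizes
assert [b.block_hash is not None for b in blocks] == [True, False, False, False]
```

With the eager path, trimming the scheduled token count after allocation would leave stale hashes behind; the delayed path never creates them in the first place.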
1 parent 226bc60 commit bf8c8ad

File tree

5 files changed: +107 additions, -79 deletions


docs/sub_block_prefix_caching.md

Lines changed: 14 additions & 8 deletions
@@ -73,7 +73,9 @@ so that each prefill does not span multiple blocks.
 * `RBLNKVCacheManager`: Extends upstream `KVCacheManager`.
   * Overrides
     * `allocate_slots` queues the request for sub-block indexing work
-      to be processed by `do_pending_indexing`.
+      when `delay_cache_blocks=False`.
+      When `delay_cache_blocks=True`, the caller must call
+      `schedule_sub_block_indexing()` after `cache_blocks()`.
     * `free` indexes all blocks (full + partial) for the finishing request
       and assigns a synthetic `block_hash` to the partial block.
     * `reset_prefix_cache` clears sub-block indices and pending indexing.
@@ -83,8 +85,9 @@ so that each prefill does not span multiple blocks.
   * `apply_sub_block_match` / `release_sub_block_match` consume or discard the handle.
   * `drain_pending_copy_ops()` retrieves the KV cache copy ops accumulated in the current scheduling step.
   * `release_copy_ops()` releases the source-block references after the model runner finishes copying.
-  * `do_pending_indexing()` indexes sub-blocks for requests for which
-    `allocate_slots` was called in the current scheduling step.
+  * `schedule_sub_block_indexing(request)` records that a request
+    needs sub-block indexing.
+  * `do_pending_indexing()` executes the scheduled indexing work.
     Must be called after `super().update_from_output()`.
   * `can_use_sub_block_caching()` checks eligibility.
 * `KVCacheCopyOp`: Dataclass describing a sub-block KV data copy:
@@ -130,7 +133,12 @@ Sub-block indexing is **deferred** until after the forward pass writes KV data.
 This ensures that concurrent prefills in the same scheduling step cannot match
 sub-blocks whose KV data has not yet been computed and thus should not be copied.
 
-During `allocate_slots`, requests are queued for deferred indexing.
+When the scheduler schedules a request, it should schedule sub-block indexing for that request.
+This is done automatically when `allocate_slots` is called with `delay_cache_blocks=False`.
+If `delay_cache_blocks=True`, the caller must call `schedule_sub_block_indexing()` after upstream `cache_blocks()`.
+The current implementation of `RBLNScheduler` uses the latter approach,
+because its complex scheduling logic requires manual control over full block caching.
+
 `RBLNScheduler.update_from_output` first calls `super().update_from_output()`
 (which updates `num_computed_tokens` and `free()`s finished requests),
 then calls `do_pending_indexing` for the remaining running requests.
@@ -170,12 +178,10 @@ produce logits).
 ### Step 4: Copy op scheduling
 
 After `allocate_slots` succeeds, the scheduler calls
-`apply_sub_block_match(match, request)` which, for each group:
+`apply_sub_block_match(match)` which, for each group:
 1. Looks up the destination block (newly allocated at the match boundary)
 2. Appends `KVCacheCopyOp(group_id, src_block_id, dst_block_id, num_tokens)`
 
-(`allocate_slots` itself only queues deferred sub-block indexing.)
-
 ### Step 5: Copy execution (model runner)
 
 The scheduler returns `RBLNSchedulerOutput` containing `kv_cache_copy_ops`.
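The copy executed in Step 5 amounts to a per-group slice assignment between block slots. A NumPy sketch follows; the slicing pattern comes from the doc, but the tensor layout and concrete sizes here are assumptions for illustration, and the real runner operates on device tensors:

```python
import numpy as np

# Assumed layout: kv_cache[kv, block, layer, head, token, dim].
# Sizes are made up; only the slicing pattern mirrors the doc.
num_blocks, num_layers, num_heads, block_size, head_dim = 4, 2, 2, 16, 8
rng = np.random.default_rng(0)
kv_cache = rng.normal(
    size=(2, num_blocks, num_layers, num_heads, block_size, head_dim)
)

def run_copy_op(src_block_id: int, dst_block_id: int, num_tokens: int) -> None:
    # Copy the first num_tokens token slots of the matched source block
    # into the newly allocated destination block.
    kv_cache[:, dst_block_id, :, :, :num_tokens, :] = \
        kv_cache[:, src_block_id, :, :, :num_tokens, :]

run_copy_op(src_block_id=0, dst_block_id=3, num_tokens=4)
assert np.array_equal(kv_cache[:, 3, :, :, :4, :], kv_cache[:, 0, :, :, :4, :])
```

Token slots beyond `num_tokens` in the destination block are left untouched; they will be filled by the forward pass.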
@@ -191,7 +197,7 @@ kv_cache[:, dst_block_id, :, :, :num_tokens, :] = \
 ### Block lifecycle
 
 - **Indexing running requests**:
-  Scheduled by `allocate_slots`, then executed by `do_pending_indexing`
+  Scheduled by `allocate_slots` or by the caller, then executed by `do_pending_indexing`
   (called after `super().update_from_output()`).
   Indexes both full blocks and complete sub-blocks within partial blocks.
- **Indexing finished requests**: `free()` consumes the pending-indexing
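The schedule-then-index lifecycle above can be sketched with a minimal stand-in. `ToyIndexer` is hypothetical; only the two-phase contract (record during scheduling, index after the forward pass has written KV data) reflects the doc:

```python
# Toy sketch of the deferred sub-block indexing contract.
# ToyIndexer is a hypothetical stand-in, not the real RBLNKVCacheManager.

class ToyIndexer:
    def __init__(self):
        self._pending = {}            # request_id -> num_full_blocks_before
        self.sub_block_index = set()  # (request_id, block_idx) entries

    def schedule_sub_block_indexing(self, req_id, num_full_blocks_before):
        # Called during scheduling; records a snapshot, indexes nothing yet.
        self._pending[req_id] = num_full_blocks_before

    def do_pending_indexing(self, num_computed_blocks):
        # Called after update_from_output, i.e. after KV data was written,
        # so concurrent prefills can never match uncomputed sub-blocks.
        for req_id, before in self._pending.items():
            for i in range(before, num_computed_blocks[req_id]):
                self.sub_block_index.add((req_id, i))
        self._pending.clear()

idx = ToyIndexer()
idx.schedule_sub_block_indexing("req-0", num_full_blocks_before=1)
assert not idx.sub_block_index          # nothing indexed at schedule time

idx.do_pending_indexing({"req-0": 3})   # blocks 1 and 2 became computed
assert idx.sub_block_index == {("req-0", 1), ("req-0", 2)}
```

Because indexing only happens in the second phase, a request that was scheduled but never computed contributes nothing to the index.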

tests/torch_compile/unit/v1/core/test_prefix_caching.py

Lines changed: 40 additions & 0 deletions
@@ -138,6 +138,46 @@ def test_preallocation_in_prefill():
     )
 
 
+def test_chunked_prefill_caches_blocks_progressively():
+    # With delay_cache_blocks + finalization, verify that each prefill
+    # chunk caches exactly the computed blocks.
+    block_size = 16
+    num_blocks_per_request = 4
+
+    scheduler = create_scheduler(
+        block_size=block_size,
+        max_num_batched_tokens=block_size,  # 1 block per step
+        enable_prefix_caching=True,
+        max_model_len=num_blocks_per_request * block_size * 2,
+    )
+    mgr = scheduler.kv_cache_manager
+
+    req = create_requests(
+        1,
+        num_tokens=num_blocks_per_request * block_size,
+        max_tokens=1,
+        same_prompt=True,
+    )[0]
+    scheduler.add_request(req)
+
+    for step in range(num_blocks_per_request):
+        output = scheduler.schedule()
+
+        # All blocks are pre-allocated from step 0.
+        blocks = mgr.get_blocks(req.request_id).blocks[0]
+        assert len(blocks) == num_blocks_per_request
+
+        # After finalization: exactly (step + 1) blocks should be cached.
+        cached = [b for b in blocks if b.block_hash is not None]
+        uncached = [b for b in blocks if b.block_hash is None]
+        assert len(cached) == step + 1
+        assert len(uncached) == num_blocks_per_request - step - 1
+
+        is_last = step == num_blocks_per_request - 1
+        runner_out = create_runner_output(output, 0 if is_last else None)
+        scheduler.update_from_output(output, runner_out)
+
+
 def test_preallocation_in_decode():
     # test that block preallocation during the decode phase
     # does not break prefix caching functionality

tests/torch_compile/unit/v1/core/test_sub_block_prefix_caching.py

Lines changed: 6 additions & 5 deletions
@@ -1831,9 +1831,10 @@ def test_multi_turn_conversation(self):
         expected_scheduled = req2.num_tokens - num_computed
         assert output2.num_scheduled_tokens["turn2"] == expected_scheduled
 
-    def test_speculative_alloc_undo_cleans_sub_block_index(self):
-        """Speculative allocate_slots + undo_uncomputed_block_caching must not
-        leave stale sub-block index entries for blocks that were never computed.
+    def test_speculative_alloc_does_not_index_uncomputed_blocks(self):
+        """Pre-allocated but uncomputed blocks must not appear in the
+        sub-block index. With delay_cache_blocks=True, only blocks that
+        are explicitly cached in the finalization step get indexed.
         """
         BS = self.BLOCK_SIZE  # 16
         SBS = self.SUB_BLOCK_SIZE  # 4
@@ -1846,9 +1847,9 @@ def test_speculative_alloc_undo_cleans_sub_block_index(self):
         # 3 full blocks + 1 partial block.
         # The scheduler pre-allocates blocks for ALL tokens but only computes
         # one chunk (BS tokens) per iteration.
-        # After undo_uncomputed_block_caching:
+        # With delay_cache_blocks + finalization:
         #   block 0: computed and indexed
-        #   blocks 1-2: full but uncomputed and unindexed
+        #   blocks 1-2: full, never got cached/indexed
         #   block 3: partial, never got indexed
         tokens = list(range(3 * BS + SBS))
        req = _make_request("req", tokens, BS, max_tokens=1)

vllm_rbln/v1/core/rbln_kv_cache_manager.py

Lines changed: 20 additions & 13 deletions
@@ -469,10 +469,6 @@ def allocate_slots(
         delay_cache_blocks: bool = False,
         num_encoder_tokens: int = 0,
     ) -> KVCacheBlocks | None:
-        num_full_blocks_before = tuple(
-            request.num_computed_tokens // gi.block_size for gi in self._group_infos
-        )
-
         result = super().allocate_slots(
             request,
             num_new_tokens,
@@ -484,17 +480,28 @@ def allocate_slots(
             num_encoder_tokens,
         )
 
-        if result is not None:
-            # Defer sub-block indexing until after execute_model writes KV cache,
-            # so that concurrent prefills in the same step cannot match sub-blocks
-            # whose KV data does not yet exist.
-            self._pending_indexing[request.request_id] = (
-                request,
-                num_full_blocks_before,
-            )
+        if result is not None and not delay_cache_blocks:
+            # When delay_cache_blocks=True, the caller is responsible for
+            # calling schedule_sub_block_indexing() after cache_blocks().
+            self.schedule_sub_block_indexing(request)
 
         return result
 
+    def schedule_sub_block_indexing(self, request: Request) -> None:
+        """Record that *request* needs sub-block indexing in the next
+        ``do_pending_indexing`` call.
+
+        When ``allocate_slots`` is called with ``delay_cache_blocks=False``,
+        this is called automatically; otherwise the caller must call it explicitly.
+        """
+        num_full_blocks_before = tuple(
+            request.num_computed_tokens // gi.block_size for gi in self._group_infos
+        )
+        self._pending_indexing[request.request_id] = (
+            request,
+            num_full_blocks_before,
+        )
+
     def drain_pending_copy_ops(self) -> list[KVCacheCopyOp]:
         """Return and clear all pending copy operations.
@@ -579,7 +586,7 @@ def _get_or_compute_sub_hashes(self, request: Request) -> list[BlockHash]:
     def _index_newly_cached_blocks(
         self, request: Request, num_full_blocks_before: tuple[int, ...]
     ) -> None:
-        """After allocate_slots caches new full blocks, index their sub-blocks."""
+        """Index sub-blocks for newly cached full blocks since the last call."""
         blocks = self.coordinator.get_blocks(request.request_id)
         for gi, block_list, before in zip(
             self._group_infos, blocks, num_full_blocks_before

vllm_rbln/v1/core/rbln_scheduler.py

Lines changed: 27 additions & 53 deletions
@@ -12,13 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import itertools
 import time
 from dataclasses import dataclass, field
 
 from vllm.distributed.ec_transfer.ec_connector.base import ECConnectorMetadata
 from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
 from vllm.utils.hashing import get_hash_fn_by_name
-from vllm.v1.core.kv_cache_manager import KVCacheBlocks, KVCacheManager
+from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 from vllm.v1.core.kv_cache_utils import init_none_hash
 from vllm.v1.core.sched.interface import PauseState
 from vllm.v1.core.sched.output import NewRequestData, SchedulerOutput
@@ -52,30 +53,6 @@ def is_prefill(request: Request) -> bool:
     return request.num_computed_tokens < request.num_tokens - 1
 
 
-def undo_uncomputed_block_caching(
-    request: Request,
-    kv_cache_manager: KVCacheManager,
-    num_computed_tokens: int | None = None,
-) -> None:
-    grouped_blocks = kv_cache_manager.get_blocks(request.request_id).blocks
-    num_computed_blocks = [
-        (num_computed_tokens or request.num_computed_tokens)
-        // group.kv_cache_spec.block_size
-        for group in kv_cache_manager.kv_cache_config.kv_cache_groups
-    ]
-    for blocks, num_full_block in zip(grouped_blocks, num_computed_blocks):
-        for block in blocks[num_full_block:]:
-            # NOTE(RBLN): this function call efficiently resets
-            # the block hash and evicts the corresponding block from the cache.
-            kv_cache_manager.block_pool._maybe_evict_cached_block(block)
-
-    for manager in kv_cache_manager.coordinator.single_type_managers:
-        # NOTE(RBLN): SingleTypeKVCacheManager instances track the number of
-        # cached blocks of running requests in num_cached_block dictionary.
-        if request.request_id in manager.num_cached_block:
-            manager.num_cached_block[request.request_id] = num_full_block
-
-
 class RBLNScheduler(Scheduler):
     def __init__(
         self,
@@ -264,6 +241,8 @@ def schedule(self) -> RBLNSchedulerOutput:
                 request,
                 num_new_tokens,
                 num_lookahead_tokens=self.num_lookahead_tokens,
+                # NOTE(RBLN): Cache blocks only after scheduling is finalized.
+                delay_cache_blocks=True,
             )
 
             if new_blocks is not None:
@@ -390,14 +369,6 @@ def schedule(self) -> RBLNSchedulerOutput:
                     continue
                 new_n = spec_decode_cap
 
-                # Extra blocks were allocated for the original token count but
-                # are no longer needed. Invalidate their prefix cache hash so
-                # they are not reused incorrectly; the blocks remain allocated
-                # and will be reused when this request needs them in a future step.
-                undo_uncomputed_block_caching(
-                    req, self.kv_cache_manager, req.num_computed_tokens + new_n
-                )
-
                 token_budget += old_n - new_n
                 num_scheduled_tokens[req_id] = new_n
 
@@ -643,7 +614,8 @@ def schedule(self) -> RBLNSchedulerOutput:
                 new_computed_blocks=new_computed_blocks,
                 num_lookahead_tokens=effective_lookahead_tokens,
                 num_external_computed_tokens=num_external_computed_tokens,
-                delay_cache_blocks=load_kv_async,
+                # NOTE(RBLN): Cache blocks only after scheduling is finalized.
+                delay_cache_blocks=True,
                 num_encoder_tokens=num_encoder_tokens,
             )
 
@@ -662,18 +634,6 @@ def schedule(self) -> RBLNSchedulerOutput:
                 self.kv_cache_manager.apply_sub_block_match(sub_block_match)
                 sub_block_match = None
 
-            # NOTE(RBLN): By calling allocate_slots with
-            # request.num_tokens - num_computed_tokens instead of num_new_tokens,
-            # we pre-allocate slots for all tokens that this request will prefill.
-            # If allocated slots end up filling a block, the block hash would also
-            # would be written down. However, since this iteration may not actually
-            # compute all tokens, the block may not be fully computed. Therefore,
-            # if the block is not finalized in this iteration, we must clear the
-            # block hash and undo block caching.
-            undo_uncomputed_block_caching(
-                request, self.kv_cache_manager, num_computed_tokens + num_new_tokens
-            )
-
             # KVTransfer: the connector uses this info to determine
             # if a load is needed. Note that
             # This information is used to determine if a load is
@@ -763,20 +723,14 @@ def schedule(self) -> RBLNSchedulerOutput:
             # current step. In the next step (or after this request's prefill
             # completes if it cannot finish within a single step) this request will
             # be scheduled together with the other running requests in the decoding
-            # phase. We also clear the block hash written in previous allocate_slots
-            # and undo block caching because this request and its tokens will be
-            # scheduled again, and allocate_slots will be invoked once more and the
-            # logic that writes the block hash will run again. Without clearing it
-            # here, an assertion error would occur because a block hash would
-            # already exist.
+            # phase.
             for req in scheduled_running_reqs:
                 req_to_new_blocks.pop(req.request_id)
                 num_scheduled_tokens.pop(req.request_id)
                 req.spec_token_ids = scheduled_spec_decode_tokens.pop(
                     req.request_id, []
                 )
                 scheduled_encoder_inputs.pop(req.request_id, None)
-                undo_uncomputed_block_caching(req, self.kv_cache_manager)
 
             scheduled_running_reqs.clear()
             token_budget = prefill_token_budget
@@ -807,6 +761,26 @@ def schedule(self) -> RBLNSchedulerOutput:
             scheduled_running_reqs
         ) <= len(self.running)
 
+        # NOTE(RBLN): All allocate_slots calls above used delay_cache_blocks=True
+        # so that scheduling decisions (spec_decode_cap trimming, prefill kicking
+        # out running decodes) can adjust token counts without needing to undo
+        # premature caching. Now that scheduling is finalized, cache blocks and
+        # schedule sub-block indexing for all scheduled requests.
+        for req in itertools.chain(
+            scheduled_running_reqs, scheduled_new_reqs, scheduled_resumed_reqs
+        ):
+            self.kv_cache_manager.cache_blocks(
+                req,
+                # Cap at req.num_tokens to exclude unverified spec decode
+                # draft tokens, matching the upstream allocate_slots behavior.
+                min(
+                    req.num_computed_tokens + num_scheduled_tokens[req.request_id],
+                    req.num_tokens,
+                ),
+            )
+            if isinstance(self.kv_cache_manager, RBLNKVCacheManager):
+                self.kv_cache_manager.schedule_sub_block_indexing(req)
+
         # Get the longest common prefix among all requests in the running queue.
         # This can be potentially used for cascade attention.
         num_common_prefix_blocks = [0] * len(self.kv_cache_config.kv_cache_groups)
