fix(core): block allocation for torch compile path (#77)

huijjj · web-flow · commit 575a316edbec · 2025-09-05T08:42:44.000+09:00
* fix(v1): use reserved null block for padding

* fix(v0): match min block requirements with scheduling heuristics

* lint: format code
diff --git a/vllm_rbln/v1/attention/backends/flash_attention.py b/vllm_rbln/v1/attention/backends/flash_attention.py
@@ -369,7 +369,7 @@ def build(
                 block_table_tensor,
                 torch.full(
                     (batch_padding_size, block_table_tensor.shape[-1]),
-                    block_table_tensor.numel() - 1,
+                    0,
                 ),
             ])
             decode_attention_mask = torch.zeros(
diff --git a/vllm_rbln/v1/worker/rbln_model_runner.py b/vllm_rbln/v1/worker/rbln_model_runner.py
@@ -1581,10 +1581,6 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
         self.initialize_attn_backend(kv_cache_config)
         kv_caches = self.initialize_kv_cache_tensors(kv_cache_config)
 
-        # for partition skip, we need dummy block slot.
-        no_dummy_slots = 1
-        kv_cache_config.num_blocks -= no_dummy_slots
-
         if self.speculative_config and self.speculative_config.use_eagle():
             assert isinstance(self.drafter, EagleProposer)
             # validate all draft model layers belong to the same kv cache
diff --git a/vllm_rbln/worker/worker.py b/vllm_rbln/worker/worker.py
@@ -99,8 +99,10 @@ def __init__(
 
     def _allocate_kv_cache(self, ) -> List[torch.Tensor]:
         """Allocates KV cache on RBLN."""
+
+        # One extra block is reserved for padding.
         kv_cache_shape = self.attn_backend.get_kv_cache_shape(
-            self.num_cpu_blocks, self.block_size, self.num_heads,
+            self.num_cpu_blocks + 1, self.block_size, self.num_heads,
             self.head_size)
         kv_cache: List[torch.Tensor] = []
         logger.info("[RBLN] attention backend get_kv_cache_shape = %s",
@@ -303,12 +305,16 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
             # 1 : prefill
             num_runtimes=1 + self.scheduler_config.max_num_seqs)
 
-        max_required_num_blocks = (self.model_config.max_model_len *
-                                   self.scheduler_config.max_num_seqs //
-                                   block_size)
-
-        num_gpu_blocks = min(max_num_blocks - 1, max_required_num_blocks)
+        max_required_num_blocks = (
+            self.model_config.max_model_len *
+            self.scheduler_config.max_num_seqs //
+            block_size) + self.scheduler_config.max_num_seqs + 1
 
+        # min(max_num_blocks, max_required_num_blocks) blocks are
+        # allocated, but the last one is reserved for padding. vLLM
+        # should therefore see one fewer usable block than the number
+        # actually allocated, hence the trailing `- 1`.
+        num_gpu_blocks = min(max_num_blocks, max_required_num_blocks) - 1
         if npu_num_blocks := os.environ.get("VLLM_RBLN_NPU_NUM_BLOCKS"):
             num_gpu_blocks = int(npu_num_blocks) - 1