@@ -99,8 +99,10 @@ def __init__(
9999
100100 def _allocate_kv_cache (self , ) -> List [torch .Tensor ]:
101101 """Allocates KV cache on RBLN."""
102+
103+ # One extra block is reserved for padding.
102104 kv_cache_shape = self .attn_backend .get_kv_cache_shape (
103- self .num_cpu_blocks , self .block_size , self .num_heads ,
105+ self .num_cpu_blocks + 1 , self .block_size , self .num_heads ,
104106 self .head_size )
105107 kv_cache : List [torch .Tensor ] = []
106108 logger .info ("[RBLN] attention backend get_kv_cache_shape = %s" ,
@@ -303,12 +305,16 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
303305 # 1 : prefill
304306 num_runtimes = 1 + self .scheduler_config .max_num_seqs )
305307
306- max_required_num_blocks = (self .model_config .max_model_len *
307- self .scheduler_config .max_num_seqs //
308- block_size )
309-
310- num_gpu_blocks = min (max_num_blocks - 1 , max_required_num_blocks )
308+ max_required_num_blocks = (
309+ self .model_config .max_model_len *
310+ self .scheduler_config .max_num_seqs //
311+ block_size ) + self .scheduler_config .max_num_seqs + 1
311312
313+ # We always allocate this number of blocks, but the last one is
314+ # reserved for padding. As a result, the vLLM system should treat
315+ # it as if there is one fewer usable block than the number
316+ # actually allocated.
317+ num_gpu_blocks = min (max_num_blocks , max_required_num_blocks ) - 1
312318 if npu_num_blocks := os .environ .get ("VLLM_RBLN_NPU_NUM_BLOCKS" ):
313319 num_gpu_blocks = int (npu_num_blocks ) - 1
314320
0 commit comments