fix: sync with optimum-rbln fix (#15)

rebel-seinpark · web-flow · commit a547fe305a02 · 2025-07-23T19:51:44.000+09:00
* fix: batch 1 case

* ruff
diff --git a/vllm_rbln/worker/optimum_worker.py b/vllm_rbln/worker/optimum_worker.py
@@ -79,7 +79,9 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         if attn_impl is not None and attn_impl == "flash_attn":
             # We use the last block as dummy block
             num_gpu_blocks = (
-                self.model_runner.model.model.get_kvcache_num_blocks() - 1)
+                self.model_runner.model.model.get_kvcache_num_blocks() - 1) \
+                if self.model_runner.model.model.rbln_config.batch_size > 1 \
+                else (self.model_runner.model.model.get_kvcache_num_blocks())
 
             if npu_num_blocks := os.environ.get("VLLM_RBLN_NPU_NUM_BLOCKS"):
                 num_gpu_blocks = int(npu_num_blocks) - 1