|
30 | 30 | from vllm.platforms import current_platform |
31 | 31 | from vllm.sequence import IntermediateTensors |
32 | 32 | from vllm.tasks import SupportedTask |
33 | | -from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec, FullAttentionSpec |
| 33 | +from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec |
34 | 34 | from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput, |
35 | 35 | DraftTokenIds, ModelRunnerOutput) |
36 | 36 | from vllm.v1.utils import report_usage_stats |
|
39 | 39 | import vllm_rbln.rbln_envs as envs |
40 | 40 | from vllm_rbln.logger import init_logger |
41 | 41 | from vllm_rbln.v1.worker.rbln_model_runner import RBLNModelRunner |
42 | | -from vllm_rbln.worker.utils import get_maximum_num_blocks |
| 42 | +from vllm_rbln.worker.utils import estimate_available_memory |
43 | 43 |
|
44 | 44 | logger = init_logger(__name__) |
45 | 45 |
|
@@ -228,49 +228,21 @@ def determine_available_memory(self) -> int: |
228 | 228 |
|
229 | 229 | # NOTE - model parallelism (tp, dp, ep, pp) is already applied to the model params |
230 | 230 | n_model_params = n_model_attentions + n_model_experts |
231 | | - block_size = self.cache_config.block_size |
232 | 231 |
|
233 | | - # This function comes from optimum-rbln. |
234 | | - # We must keep it updated as optimum is upgraded. |
235 | | - max_num_blocks = get_maximum_num_blocks( |
| 232 | + available_memory_estimate = estimate_available_memory( |
236 | 233 | model_config=self.model_config, |
237 | 234 | parallel_config=self.parallel_config, |
238 | | - kvcache_block_size=block_size, |
239 | 235 | # quantization : 4 (This is an ad-hoc value. Need to fix it) |
240 | 236 | nbits_per_param=nbits_per_param, |
241 | 237 | n_model_params=n_model_params, |
242 | | - num_runtimes=num_runtimes) |
243 | | - |
244 | | - # NOTE - adjust max_num_blocks considering swa block sharing |
245 | | - # max_num_blocks - based on FullAttentionSpec for model |
246 | | - # SHOULD adjust num blocks considering non-full-attention layers |
247 | | - kv_cache_spec = self.model_runner.get_kv_cache_spec() |
248 | | - page_size = max(spec.page_size_bytes |
249 | | - for spec in kv_cache_spec.values()) |
250 | | - num_layers = len(kv_cache_spec) |
251 | | - num_attn_layers = 0 |
252 | | - for spec in kv_cache_spec.values(): |
253 | | - num_attn_layers += int(isinstance(spec, FullAttentionSpec)) |
254 | | - max_num_blocks = max_num_blocks * num_layers / num_attn_layers |
255 | | - |
256 | | - # for partition skip, we need dummy block slot. |
257 | | - no_dummy_slots = 1 |
258 | | - max_required_num_blocks = (self.model_config.max_model_len * |
259 | | - self.scheduler_config.max_num_seqs // |
260 | | - block_size) + no_dummy_slots |
261 | | - num_gpu_blocks = min( |
262 | | - int(max_num_blocks * self.cache_config.gpu_memory_utilization), |
263 | | - max_required_num_blocks) |
264 | | - logger.info("max_num_blocks(%s), required_num_blocks(%s), num_blocks(%s)", |
265 | | - max_num_blocks, max_required_num_blocks, num_gpu_blocks) |
266 | | - |
267 | | - if npu_num_blocks := os.environ.get("VLLM_RBLN_NPU_NUM_BLOCKS"): |
268 | | - num_gpu_blocks = int(npu_num_blocks) |
269 | | - |
270 | | - # NOTE - consider SWA hybrid models |
271 | | - # SWA shares blocks with Full Attention, DO NOT count SWA layers |
272 | | - available_memory = num_gpu_blocks * page_size * num_attn_layers |
273 | | - return available_memory |
| 238 | + num_runtimes=num_runtimes, |
| 239 | + gpu_memory_utilization=self.cache_config.gpu_memory_utilization, |
| 240 | + ) |
| 241 | + |
| 242 | + logger.info("available_memory_estimate = %.2f GB", |
| 243 | + available_memory_estimate / 10**9) |
| 244 | + |
| 245 | + return available_memory_estimate |
274 | 246 |
|
275 | 247 | def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: |
276 | 248 | return self.model_runner.get_kv_cache_spec() |
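
The body of `estimate_available_memory` lives in `vllm_rbln.worker.utils` and is not shown in this diff. Purely as an illustration of the kind of arithmetic the new call implies, here is a minimal sketch; the standalone function name, parameter list, and fixed per-runtime overhead below are assumptions for illustration, not the actual implementation:

```python
# Hypothetical sketch only: NOT the implementation of
# vllm_rbln.worker.utils.estimate_available_memory, whose body is not in this diff.
# The per-runtime overhead and all parameter names are assumptions.

def estimate_available_memory_sketch(
    total_device_memory: int,       # total device (NPU) memory in bytes
    n_model_params: int,            # params after tp/dp/ep/pp sharding
    nbits_per_param: int,           # e.g. 16 for bf16, 4 for 4-bit quantized weights
    num_runtimes: int,              # number of compiled runtimes kept resident
    per_runtime_overhead: int,      # assumed fixed reservation per runtime, in bytes
    gpu_memory_utilization: float,  # fraction of the remainder given to the KV cache
) -> int:
    """Rough estimate of the memory left over for the KV cache."""
    weight_bytes = n_model_params * nbits_per_param // 8
    runtime_bytes = num_runtimes * per_runtime_overhead
    remaining = total_device_memory - weight_bytes - runtime_bytes
    return max(0, int(remaining * gpu_memory_utilization))


# Example: 16 GB device, 7B params at 16 bits, two runtimes reserving 256 MB each,
# 90% of the remainder handed to the KV cache.
print(estimate_available_memory_sketch(
    total_device_memory=16 * 10**9,
    n_model_params=7 * 10**9,
    nbits_per_param=16,
    num_runtimes=2,
    per_runtime_overhead=256 * 2**20,
    gpu_memory_utilization=0.9,
))
```

Note that the diff drops the block-count arithmetic (page size, SWA block sharing, the `VLLM_RBLN_NPU_NUM_BLOCKS` override) from the worker: `determine_available_memory` now returns a byte estimate, so the number of KV-cache blocks can be derived upstream from the KV cache spec rather than inside the worker.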
|