
Commit 5d8bd7a

fix: simplify determine_available_memory

* We don't need to compute the number of blocks, since the vLLM allocator already does it and properly accounts for different layer types.
* Remove VLLM_RBLN_NPU_NUM_BLOCKS. Users should use the standard gpu_memory_utilization config instead.
* Don't cap the estimate at the maximum memory active requests can use, since the prefix cache can utilize the extra memory.
1 parent bbf95a2 commit 5d8bd7a

2 files changed: 19 additions & 51 deletions
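The first bullet relies on vLLM's allocator deriving the block count from the bytes the worker reports, charging each KV-cache layer group its own page size per block. A minimal sketch of that derivation, assuming a hypothetical `page_sizes` mapping (this is illustrative, not the actual vLLM allocator code):

```python
# Minimal sketch (not vLLM's actual allocator): given the bytes a worker
# reports from determine_available_memory(), derive the shared block count.
# `page_sizes` is a hypothetical mapping of layer-group name -> bytes that
# one KV-cache block occupies in that group.
def num_blocks_from_available_memory(available_memory: int,
                                     page_sizes: dict[str, int]) -> int:
    # Every block reserves one page in each layer group, so the per-block
    # cost is the sum of the group page sizes.
    bytes_per_block = sum(page_sizes.values())
    return available_memory // bytes_per_block


# Example: an 8 GB budget with full-attention pages of 512 KiB and
# sliding-window pages of 64 KiB per block.
print(num_blocks_from_available_memory(
    8 * 10**9, {"full_attention": 512 * 1024, "sliding_window": 64 * 1024}))
```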


vllm_rbln/v1/worker/rbln_worker.py

Lines changed: 11 additions & 39 deletions

```diff
@@ -30,7 +30,7 @@
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 from vllm.tasks import SupportedTask
-from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec, FullAttentionSpec
+from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
                              DraftTokenIds, ModelRunnerOutput)
 from vllm.v1.utils import report_usage_stats
@@ -39,7 +39,7 @@
 import vllm_rbln.rbln_envs as envs
 from vllm_rbln.logger import init_logger
 from vllm_rbln.v1.worker.rbln_model_runner import RBLNModelRunner
-from vllm_rbln.worker.utils import get_maximum_num_blocks
+from vllm_rbln.worker.utils import estimate_available_memory

 logger = init_logger(__name__)

@@ -228,49 +228,21 @@ def determine_available_memory(self) -> int:

         # NOTE - model parallel(tp, dp, ep, pp) already applied into model params
         n_model_params = n_model_attentions + n_model_experts
-        block_size = self.cache_config.block_size

-        # This function comes from optimum-rbln.
-        # We must keep it updated as optimum is upgraded.
-        max_num_blocks = get_maximum_num_blocks(
+        available_memory_estimate = estimate_available_memory(
             model_config=self.model_config,
             parallel_config=self.parallel_config,
-            kvcache_block_size=block_size,
             # quantization : 4 (This is an ad-hoc value. Need to fix it)
             nbits_per_param=nbits_per_param,
             n_model_params=n_model_params,
-            num_runtimes=num_runtimes)
-
-        # NOTE - adjust max_num_blocks considering swa block sharing
-        # max_num_blocks - based on FullAttentionSpec for model
-        # SHOULD adjust num blocks considering non full attention
-        kv_cache_spec = self.model_runner.get_kv_cache_spec()
-        page_size = max(spec.page_size_bytes
-                        for spec in kv_cache_spec.values())
-        num_layers = len(kv_cache_spec)
-        num_attn_layers = 0
-        for spec in kv_cache_spec.values():
-            num_attn_layers += int(isinstance(spec, FullAttentionSpec))
-        max_num_blocks = max_num_blocks * num_layers / num_attn_layers
-
-        # for partition skip, we need dummy block slot.
-        no_dummy_slots = 1
-        max_required_num_blocks = (self.model_config.max_model_len *
-                                   self.scheduler_config.max_num_seqs //
-                                   block_size) + no_dummy_slots
-        num_gpu_blocks = min(
-            int(max_num_blocks * self.cache_config.gpu_memory_utilization),
-            max_required_num_blocks)
-        logger.info("max_num_blocks(%s), required_num_blocks(%s), num_blocks(%s)",
-                    max_num_blocks, max_required_num_blocks, num_gpu_blocks)
-
-        if npu_num_blocks := os.environ.get("VLLM_RBLN_NPU_NUM_BLOCKS"):
-            num_gpu_blocks = int(npu_num_blocks)
-
-        # NOTE - consider SWA hybrid models
-        # SWA shares blocks with Full Attention, DO NOT count SWA layers
-        available_memory = num_gpu_blocks * page_size * num_attn_layers
-        return available_memory
+            num_runtimes=num_runtimes,
+            gpu_memory_utilization=self.cache_config.gpu_memory_utilization,
+        )
+
+        logger.info("available_memory_estimate = %.2f GB",
+                    available_memory_estimate / 10**9)
+
+        return available_memory_estimate

     def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
         return self.model_runner.get_kv_cache_spec()
```
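With the worker now returning a plain byte estimate, the standard `gpu_memory_utilization` engine argument controls the cache budget end to end, replacing the removed VLLM_RBLN_NPU_NUM_BLOCKS override. A hypothetical invocation (the model name is an example, not part of this commit):

```python
from vllm import LLM

# gpu_memory_utilization now scales the DRAM estimate computed by
# estimate_available_memory(); 0.8 leaves 20% of device memory as headroom.
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",  # example model only
          gpu_memory_utilization=0.8)
```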

vllm_rbln/worker/utils.py

Lines changed: 8 additions & 12 deletions

```diff
@@ -22,15 +22,16 @@
 import vllm_rbln.rbln_envs as envs


-def get_maximum_num_blocks(
+# NOTE: This function comes from optimum-rbln. Keep in sync.
+def estimate_available_memory(
     model_config: ModelConfig,
     parallel_config: ParallelConfig,
-    kvcache_block_size: int,
     nbits_per_param: Optional[int] = None,
     n_model_params: Optional[int] = None,
     kernel_size: Optional[int] = None,
     buffer: Optional[int] = None,
     num_runtimes: int = 2,
+    gpu_memory_utilization: float = 0.9,
 ) -> int:
     # We are finding max_num_blocks(x) that satisfies the following equation:

@@ -104,9 +105,12 @@ def align_2MB(x: int) -> int:
     else:
         assert False, "invalid RBLN architecture, candidates = [ATOM(ca), REBEL(cr)]"

+    available_dram_bytes = int(available_dram_bytes * gpu_memory_utilization)
+
     def check_oom(available_dram_bytes: int) -> None:
         if available_dram_bytes <= 0:
-            raise MemoryError("Insufficient DRAM during block calculation.")
+            raise MemoryError("Insufficient DRAM during block calculation. "
+                              "Try reducing gpu_memory_utilization.")

     if kernel_size is None:
         if n_model_params is None:
@@ -140,12 +144,4 @@ def check_oom(available_dram_bytes: int) -> None:

     check_oom(available_dram_bytes)

-    kv = 2
-    kv_bytes = 2
-    num_kv_heads = math.ceil(num_key_value_heads / rsd_size) * rsd_size
-    head_dim = align(head_dim, 64)
-    # [2(=kv), H(=num_kv_heads), 1, B(=block_size), D(=head_dim)]
-    kv_cache_block_bytes = kv * kvcache_block_size * head_dim * num_kv_heads * kv_bytes * num_layers
-    # for each k, v, max_num_blocks calculation is done
-    max_num_blocks = available_dram_bytes / kv_cache_block_bytes
-    return max_num_blocks
+    return available_dram_bytes
```
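The deleted block and the hunk headers reference `align` and `align_2MB` helpers whose definitions sit outside this diff. A plausible sketch, assuming they simply round a value up to a multiple:

```python
# Plausible sketch of the alignment helpers referenced above; their real
# definitions live elsewhere in utils.py and may differ.
def align(x: int, n: int) -> int:
    # Round x up to the nearest multiple of n.
    return ((x + n - 1) // n) * n


def align_2MB(x: int) -> int:
    # Pad a byte size to a 2 MiB boundary.
    return align(x, 2 * 1024 * 1024)
```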
