|
30 | 30 | from vllm.platforms import current_platform |
31 | 31 | from vllm.sequence import IntermediateTensors |
32 | 32 | from vllm.tasks import SupportedTask |
33 | | -from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec, FullAttentionSpec |
| 33 | +from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec |
34 | 34 | from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput, |
35 | 35 | DraftTokenIds, ModelRunnerOutput) |
36 | 36 | from vllm.v1.utils import report_usage_stats |
|
39 | 39 | import vllm_rbln.rbln_envs as envs |
40 | 40 | from vllm_rbln.logger import init_logger |
41 | 41 | from vllm_rbln.v1.worker.rbln_model_runner import RBLNModelRunner |
42 | | -from vllm_rbln.worker.utils import get_maximum_num_blocks |
| 42 | +from vllm_rbln.worker.utils import estimate_available_memory |
43 | 43 |
|
44 | 44 | logger = init_logger(__name__) |
45 | 45 |
|
@@ -228,49 +228,21 @@ def determine_available_memory(self) -> int: |
228 | 228 |
|
229 | 229 | # NOTE - model parallelism (tp, dp, ep, pp) is already applied to the model params |
230 | 230 | n_model_params = n_model_attentions + n_model_experts |
231 | | - block_size = self.cache_config.block_size |
232 | 231 |
|
233 | | - # This function comes from optimum-rbln. |
234 | | - # We must keep it updated as optimum is upgraded. |
235 | | - max_num_blocks = get_maximum_num_blocks( |
| 232 | + available_memory_estimate = estimate_available_memory( |
236 | 233 | model_config=self.model_config, |
237 | 234 | parallel_config=self.parallel_config, |
238 | | - kvcache_block_size=block_size, |
239 | 235 | # quantization : 4 (This is an ad-hoc value. Need to fix it) |
240 | 236 | nbits_per_param=nbits_per_param, |
241 | 237 | n_model_params=n_model_params, |
242 | | - num_runtimes=num_runtimes) |
243 | | - |
244 | | - # NOTE - adjust max_num_blocks considering swa block sharing |
245 | | - # max_num_blocks - based on FullAttentionSpec for model |
246 | | - # SHOULD adjust num blocks considering non-full-attention layers |
247 | | - kv_cache_spec = self.model_runner.get_kv_cache_spec() |
248 | | - page_size = max(spec.page_size_bytes |
249 | | - for spec in kv_cache_spec.values()) |
250 | | - num_layers = len(kv_cache_spec) |
251 | | - num_attn_layers = 0 |
252 | | - for spec in kv_cache_spec.values(): |
253 | | - num_attn_layers += int(isinstance(spec, FullAttentionSpec)) |
254 | | - max_num_blocks = max_num_blocks * num_layers / num_attn_layers |
255 | | - |
256 | | - # for partition skip, we need dummy block slot. |
257 | | - no_dummy_slots = 1 |
258 | | - max_required_num_blocks = (self.model_config.max_model_len * |
259 | | - self.scheduler_config.max_num_seqs // |
260 | | - block_size) + no_dummy_slots |
261 | | - num_gpu_blocks = min( |
262 | | - int(max_num_blocks * self.cache_config.gpu_memory_utilization), |
263 | | - max_required_num_blocks) |
264 | | - logger.info("max_num_blocks(%s), required_num_blocks(%s), num_blocks(%s)", |
265 | | - max_num_blocks, max_required_num_blocks, num_gpu_blocks) |
266 | | - |
267 | | - if npu_num_blocks := os.environ.get("VLLM_RBLN_NPU_NUM_BLOCKS"): |
268 | | - num_gpu_blocks = int(npu_num_blocks) |
269 | | - |
270 | | - # NOTE - consider SWA hybrid models |
271 | | - # SWA shares blocks with Full Attention, DO NOT count SWA layers |
272 | | - available_memory = num_gpu_blocks * page_size * num_attn_layers |
273 | | - return available_memory |
| 238 | + num_runtimes=num_runtimes, |
| 239 | + gpu_memory_utilization=self.cache_config.gpu_memory_utilization, |
| 240 | + ) |
| 241 | + |
| 242 | + logger.info("available_memory_estimate = %.2f GB", |
| 243 | + available_memory_estimate / 10**9) |
| 244 | + |
| 245 | + return available_memory_estimate |
274 | 246 |
|
275 | 247 | def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: |
276 | 248 | return self.model_runner.get_kv_cache_spec() |
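
The body of `estimate_available_memory` lives in `vllm_rbln.worker.utils` and is not shown in this diff. Purely as an illustration of the kind of arithmetic the new call implies, here is a minimal sketch; the standalone function name, parameter list, and fixed per-runtime overhead below are assumptions for illustration, not the actual implementation:

```python
# Hypothetical sketch only: NOT the implementation of
# vllm_rbln.worker.utils.estimate_available_memory, whose body is not in this diff.
# The per-runtime overhead and all parameter names are assumptions.

def estimate_available_memory_sketch(
    total_device_memory: int,       # total device (NPU) memory in bytes
    n_model_params: int,            # params after tp/dp/ep/pp sharding
    nbits_per_param: int,           # e.g. 16 for bf16, 4 for 4-bit quantized weights
    num_runtimes: int,              # number of compiled runtimes kept resident
    per_runtime_overhead: int,      # assumed fixed reservation per runtime, in bytes
    gpu_memory_utilization: float,  # fraction of the remainder given to the KV cache
) -> int:
    """Rough estimate of the memory left over for the KV cache."""
    weight_bytes = n_model_params * nbits_per_param // 8
    runtime_bytes = num_runtimes * per_runtime_overhead
    remaining = total_device_memory - weight_bytes - runtime_bytes
    return max(0, int(remaining * gpu_memory_utilization))


# Example: 16 GB device, 7B params at 16 bits, two runtimes reserving 256 MB each,
# 90% of the remainder handed to the KV cache.
print(estimate_available_memory_sketch(
    total_device_memory=16 * 10**9,
    n_model_params=7 * 10**9,
    nbits_per_param=16,
    num_runtimes=2,
    per_runtime_overhead=256 * 2**20,
    gpu_memory_utilization=0.9,
))
```

Note that the diff drops the block-count arithmetic (page size, SWA block sharing, the `VLLM_RBLN_NPU_NUM_BLOCKS` override) from the worker: `determine_available_memory` now returns a byte estimate, so the number of KV-cache blocks can be derived upstream from the KV cache spec rather than inside the worker.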
|