Skip to content

Commit 4f02d3e

Browse files
authored
Fix/paged memory check psutil (#197)
This PR: (1) replaces psutil.virtual_memory() with mx.device_info()["max_recommended_working_set_size"] as the KV cache budget ceiling; (2) extracts _kv_budget_bytes as a testable static method, with unit tests covering the normal, negative, and zero-boundary cases plus a real-world model scenario; and (3) validates the VLLM_METAL_MEMORY_FRACTION range at config construction and fails fast when the Metal working-set size is unavailable. Note: found while testing paged attention with GLM-4.7-Flash-4bit — no fraction value could satisfy the old check, which was traced to psutil's `available` field being blind to MLX wired memory. On an M2 Max with the model loaded, psutil reports ~2.2 GB free while Metal actually has ~20 GB of headroom. --------- Signed-off-by: Yuan Lik Xun <lxyuan0420@gmail.com>
1 parent e43e90f commit 4f02d3e

File tree

4 files changed

+134
-33
lines changed

4 files changed

+134
-33
lines changed

tests/test_config.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,3 +136,26 @@ def test_block_size_must_be_positive(self) -> None:
136136
os.environ["VLLM_METAL_BLOCK_SIZE"] = value
137137
with pytest.raises(ValueError, match="Invalid VLLM_METAL_BLOCK_SIZE"):
138138
MetalConfig.from_env()
139+
140+
def test_fraction_above_one_rejected(self) -> None:
141+
with pytest.raises(ValueError, match="Invalid VLLM_METAL_MEMORY_FRACTION"):
142+
MetalConfig(
143+
memory_fraction=1.5,
144+
use_mlx=False,
145+
mlx_device="gpu",
146+
block_size=16,
147+
debug=False,
148+
use_paged_attention=True,
149+
)
150+
151+
def test_fraction_zero_or_negative_rejected(self) -> None:
152+
for fraction in [0.0, -0.1]:
153+
with pytest.raises(ValueError, match="Invalid VLLM_METAL_MEMORY_FRACTION"):
154+
MetalConfig(
155+
memory_fraction=fraction,
156+
use_mlx=False,
157+
mlx_device="gpu",
158+
block_size=16,
159+
debug=False,
160+
use_paged_attention=True,
161+
)

tests/test_platform.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
from vllm.v1.attention.backends.registry import AttentionBackendEnum
1010
from vllm.v1.attention.selector import AttentionSelectorConfig
1111

12+
from vllm_metal.config import PAGED_ATTENTION_OVERHEAD_BYTES
1213
from vllm_metal.platform import MetalPlatform
14+
from vllm_metal.v1.worker import MetalWorker
1315

1416

1517
class TestMetalPlatform:
@@ -292,3 +294,62 @@ def fake_eval(_value: object) -> None:
292294

293295
MetalPlatform.synchronize()
294296
assert called is True
297+
298+
299+
class TestKvBudgetBytes:
300+
"""Tests for MetalWorker._kv_budget_bytes.
301+
302+
Numbers mirror a real M2 Max with GLM-4.7-Flash-4bit loaded:
303+
metal_limit = 22.9 GB (max_recommended_working_set_size)
304+
model_memory = 16.85 GB (mx.get_active_memory() after load)
305+
"""
306+
307+
_METAL_LIMIT = int(22.9e9)
308+
_MODEL_MEM = int(16.85e9)
309+
310+
def test_normal_case(self) -> None:
311+
budget = MetalWorker._kv_budget_bytes(
312+
self._METAL_LIMIT, self._MODEL_MEM, fraction=0.9
313+
)
314+
315+
assert (
316+
budget
317+
== int(self._METAL_LIMIT * 0.9)
318+
- self._MODEL_MEM
319+
- PAGED_ATTENTION_OVERHEAD_BYTES
320+
)
321+
assert budget > 0
322+
323+
def test_fraction_too_low_yields_negative_budget(self) -> None:
324+
# fraction=0.3 → usable=6.9 GB < model(16.85 GB) → negative
325+
budget = MetalWorker._kv_budget_bytes(
326+
self._METAL_LIMIT, self._MODEL_MEM, fraction=0.3
327+
)
328+
329+
assert budget < 0
330+
331+
def test_boundary_zero(self) -> None:
332+
# Craft inputs so budget lands exactly at zero.
333+
limit = self._MODEL_MEM + PAGED_ATTENTION_OVERHEAD_BYTES
334+
335+
budget = MetalWorker._kv_budget_bytes(limit, self._MODEL_MEM, fraction=1.0)
336+
337+
assert budget == 0
338+
339+
def test_custom_overhead(self) -> None:
340+
budget_zero_overhead = MetalWorker._kv_budget_bytes(
341+
self._METAL_LIMIT, self._MODEL_MEM, fraction=0.9, overhead=0
342+
)
343+
budget_default = MetalWorker._kv_budget_bytes(
344+
self._METAL_LIMIT, self._MODEL_MEM, fraction=0.9
345+
)
346+
347+
assert budget_zero_overhead - budget_default == PAGED_ATTENTION_OVERHEAD_BYTES
348+
349+
def test_large_model_has_positive_budget_at_default_fraction(self) -> None:
350+
# GLM-4.7-Flash-4bit at fraction=0.9 must yield > 1 GB for KV cache.
351+
budget = MetalWorker._kv_budget_bytes(
352+
self._METAL_LIMIT, self._MODEL_MEM, fraction=0.9
353+
)
354+
355+
assert budget > 1e9

vllm_metal/config.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,13 @@ def __post_init__(self) -> None:
4747
"The MLX path must use VLLM_METAL_MEMORY_FRACTION=auto."
4848
)
4949

50+
if self.use_paged_attention and not self.is_auto_memory:
51+
if not (0 < self.memory_fraction <= 1):
52+
raise ValueError(
53+
f"Invalid VLLM_METAL_MEMORY_FRACTION={self.memory_fraction}. "
54+
"Must be a finite value in (0, 1] when paged attention is enabled."
55+
)
56+
5057
@property
5158
def is_auto_memory(self) -> bool:
5259
"""Check if memory fraction is set to auto mode."""

vllm_metal/v1/worker.py

Lines changed: 43 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -147,15 +147,28 @@ def load_model(self) -> None:
147147
):
148148
self._setup_paged_attention()
149149

150+
@staticmethod
151+
def _kv_budget_bytes(
152+
metal_limit: int,
153+
model_memory: int,
154+
fraction: float,
155+
overhead: int = PAGED_ATTENTION_OVERHEAD_BYTES,
156+
) -> int:
157+
"""KV cache budget = fraction of Metal limit minus model and overhead.
158+
159+
All three quantities live in the same domain: Metal-managed memory.
160+
psutil.available is intentionally excluded — it reflects OS page-cache
161+
state and is blind to MLX wired buffers holding model weights.
162+
"""
163+
return int(metal_limit * fraction) - model_memory - overhead
164+
150165
def _setup_paged_attention(self) -> None:
151166
"""Create MetalPagedKVCache and patch model attention for native Metal kernel.
152167
153-
Computes num_blocks from available system RAM, model weight size, and
168+
Computes num_blocks from Metal memory headroom, model weight size, and
154169
a configurable memory fraction, rather than blindly scaling from
155170
max_model_len.
156171
"""
157-
import psutil
158-
159172
from vllm_metal.metal_kernel_backend.cache import MetalPagedKVCache
160173
from vllm_metal.metal_kernel_backend.paged_attention import (
161174
patch_model_attention_metal_kernel,
@@ -175,42 +188,39 @@ def _setup_paged_attention(self) -> None:
175188
else:
176189
fraction = self.metal_config.memory_fraction
177190

178-
# --- Gather memory numbers ---
179-
total_ram = psutil.virtual_memory().total
191+
# --- Gather Metal memory numbers ---
192+
# KV cache lives in Metal-managed (wired) memory. psutil.available
193+
# reflects OS page-cache state and excludes MLX wired buffers, making
194+
# it appear nearly zero when a large model is loaded. Use
195+
# max_recommended_working_set_size — the OS-reported Metal headroom —
196+
# as the budget ceiling instead.
197+
device_info = mx.device_info()
198+
metal_limit = int(device_info.get("max_recommended_working_set_size", 0))
199+
if metal_limit <= 0:
200+
raise RuntimeError(
201+
"Paged attention: mx.device_info() did not return "
202+
"max_recommended_working_set_size. "
203+
"Ensure MLX is up to date and running on Apple Silicon. "
204+
f"Reported device_info keys: {list(device_info.keys())}"
205+
)
180206
model_memory = self._get_model_memory_usage()
181207
per_block_bytes = self.get_cache_block_size_bytes()
182208

183209
# --- Compute KV budget ---
184-
usable_ram = int(total_ram * fraction)
185-
available_ram = psutil.virtual_memory().available
186-
187-
if usable_ram > available_ram:
188-
raise ValueError(
189-
"Paged attention: requested memory exceeds available RAM. "
190-
f"total_ram={total_ram / 1e9:.2f}GB, "
191-
f"fraction={fraction}, "
192-
f"usable_ram={usable_ram / 1e9:.2f}GB, "
193-
f"available_ram={available_ram / 1e9:.2f}GB. "
194-
"The OS and other processes are using "
195-
f"{(total_ram - available_ram) / 1e9:.2f}GB. "
196-
"Mitigations: lower VLLM_METAL_MEMORY_FRACTION "
197-
f"(try {available_ram / total_ram:.2f} or less), "
198-
"close other applications, or add more RAM."
199-
)
200-
201-
kv_budget = usable_ram - model_memory - PAGED_ATTENTION_OVERHEAD_BYTES
210+
usable_metal = int(metal_limit * fraction)
211+
kv_budget = self._kv_budget_bytes(metal_limit, model_memory, fraction)
202212

203213
if kv_budget <= 0:
204214
raise ValueError(
205-
"Paged attention: not enough memory for KV cache. "
206-
f"total_ram={total_ram / 1e9:.2f}GB, "
215+
"Paged attention: not enough Metal memory for KV cache. "
216+
f"metal_limit={metal_limit / 1e9:.2f}GB, "
207217
f"fraction={fraction}, "
208-
f"usable_ram={usable_ram / 1e9:.2f}GB, "
218+
f"usable_metal={usable_metal / 1e9:.2f}GB, "
209219
f"model_memory={model_memory / 1e9:.2f}GB, "
210220
f"overhead={PAGED_ATTENTION_OVERHEAD_BYTES / 1e9:.2f}GB, "
211221
f"kv_budget={kv_budget / 1e9:.2f}GB. "
212222
"Mitigations: increase VLLM_METAL_MEMORY_FRACTION, "
213-
"use a smaller model, or add more RAM."
223+
"use a smaller or more quantized model."
214224
)
215225

216226
num_blocks = kv_budget // per_block_bytes
@@ -219,28 +229,28 @@ def _setup_paged_attention(self) -> None:
219229
raise ValueError(
220230
"Paged attention: computed num_blocks too low "
221231
f"({num_blocks} < minimum {PAGED_ATTENTION_MIN_BLOCKS}). "
222-
f"total_ram={total_ram / 1e9:.2f}GB, "
232+
f"metal_limit={metal_limit / 1e9:.2f}GB, "
223233
f"fraction={fraction}, "
224-
f"usable_ram={usable_ram / 1e9:.2f}GB, "
234+
f"usable_metal={usable_metal / 1e9:.2f}GB, "
225235
f"model_memory={model_memory / 1e9:.2f}GB, "
226236
f"overhead={PAGED_ATTENTION_OVERHEAD_BYTES / 1e9:.2f}GB, "
227237
f"kv_budget={kv_budget / 1e9:.2f}GB, "
228238
f"per_block_bytes={per_block_bytes}. "
229239
"Mitigations: increase VLLM_METAL_MEMORY_FRACTION, "
230-
"use a smaller model, or add more RAM."
240+
"use a smaller or more quantized model."
231241
)
232242

233243
max_tokens_cached = num_blocks * block_size
234244

235245
logger.info(
236246
"Paged attention memory breakdown: "
237-
"total_ram=%.2fGB, fraction=%.2f, usable_ram=%.2fGB, "
247+
"metal_limit=%.2fGB, fraction=%.2f, usable_metal=%.2fGB, "
238248
"model_memory=%.2fGB, overhead=%.2fGB, "
239249
"kv_budget=%.2fGB, per_block_bytes=%d, "
240250
"num_blocks=%d, max_tokens_cached=%d",
241-
total_ram / 1e9,
251+
metal_limit / 1e9,
242252
fraction,
243-
usable_ram / 1e9,
253+
usable_metal / 1e9,
244254
model_memory / 1e9,
245255
PAGED_ATTENTION_OVERHEAD_BYTES / 1e9,
246256
kv_budget / 1e9,

0 commit comments

Comments (0)