Skip to content

Commit 4f02d3e

Browse files
authored
Fix/paged memory check psutil (#197)
This PR: (1) replaces psutil.virtual_memory() with mx.device_info()["max_recommended_working_set_size"] as the KV cache budget ceiling; (2) extracts _kv_budget_bytes as a testable static method, with unit tests covering the normal, negative, and zero-boundary cases plus a real-world model scenario; and (3) validates the VLLM_METAL_MEMORY_FRACTION range at config construction and fails fast when the Metal working-set size is unavailable. Note: found while testing paged attention with GLM-4.7-Flash-4bit — no fraction value could satisfy the old check, which was traced to psutil's `available` field being blind to MLX wired memory. On an M2 Max with the model loaded, psutil reports ~2.2 GB free while Metal actually has ~20 GB of headroom. --------- Signed-off-by: Yuan Lik Xun <lxyuan0420@gmail.com>
1 parent e43e90f commit 4f02d3e

File tree

4 files changed

+134
-33
lines changed

4 files changed

+134
-33
lines changed

tests/test_config.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,3 +136,26 @@ def test_block_size_must_be_positive(self) -> None:
136136
os.environ["VLLM_METAL_BLOCK_SIZE"] = value
137137
with pytest.raises(ValueError, match="Invalid VLLM_METAL_BLOCK_SIZE"):
138138
MetalConfig.from_env()
139+
140+
def test_fraction_above_one_rejected(self) -> None:
141+
with pytest.raises(ValueError, match="Invalid VLLM_METAL_MEMORY_FRACTION"):
142+
MetalConfig(
143+
memory_fraction=1.5,
144+
use_mlx=False,
145+
mlx_device="gpu",
146+
block_size=16,
147+
debug=False,
148+
use_paged_attention=True,
149+
)
150+
151+
def test_fraction_zero_or_negative_rejected(self) -> None:
152+
for fraction in [0.0, -0.1]:
153+
with pytest.raises(ValueError, match="Invalid VLLM_METAL_MEMORY_FRACTION"):
154+
MetalConfig(
155+
memory_fraction=fraction,
156+
use_mlx=False,
157+
mlx_device="gpu",
158+
block_size=16,
159+
debug=False,
160+
use_paged_attention=True,
161+
)

tests/test_platform.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
from vllm.v1.attention.backends.registry import AttentionBackendEnum
1010
from vllm.v1.attention.selector import AttentionSelectorConfig
1111

12+
from vllm_metal.config import PAGED_ATTENTION_OVERHEAD_BYTES
1213
from vllm_metal.platform import MetalPlatform
14+
from vllm_metal.v1.worker import MetalWorker
1315

1416

1517
class TestMetalPlatform:
@@ -292,3 +294,62 @@ def fake_eval(_value: object) -> None:
292294

293295
MetalPlatform.synchronize()
294296
assert called is True
297+
298+
299+
class TestKvBudgetBytes:
300+
"""Tests for MetalWorker._kv_budget_bytes.
301+
302+
Numbers mirror a real M2 Max with GLM-4.7-Flash-4bit loaded:
303+
metal_limit = 22.9 GB (max_recommended_working_set_size)
304+
model_memory = 16.85 GB (mx.get_active_memory() after load)
305+
"""
306+
307+
_METAL_LIMIT = int(22.9e9)
308+
_MODEL_MEM = int(16.85e9)
309+
310+
def test_normal_case(self) -> None:
311+
budget = MetalWorker._kv_budget_bytes(
312+
self._METAL_LIMIT, self._MODEL_MEM, fraction=0.9
313+
)
314+
315+
assert (
316+
budget
317+
== int(self._METAL_LIMIT * 0.9)
318+
- self._MODEL_MEM
319+
- PAGED_ATTENTION_OVERHEAD_BYTES
320+
)
321+
assert budget > 0
322+
323+
def test_fraction_too_low_yields_negative_budget(self) -> None:
324+
# fraction=0.3 → usable=6.9 GB < model(16.85 GB) → negative
325+
budget = MetalWorker._kv_budget_bytes(
326+
self._METAL_LIMIT, self._MODEL_MEM, fraction=0.3
327+
)
328+
329+
assert budget < 0
330+
331+
def test_boundary_zero(self) -> None:
332+
# Craft inputs so budget lands exactly at zero.
333+
limit = self._MODEL_MEM + PAGED_ATTENTION_OVERHEAD_BYTES
334+
335+
budget = MetalWorker._kv_budget_bytes(limit, self._MODEL_MEM, fraction=1.0)
336+
337+
assert budget == 0
338+
339+
def test_custom_overhead(self) -> None:
340+
budget_zero_overhead = MetalWorker._kv_budget_bytes(
341+
self._METAL_LIMIT, self._MODEL_MEM, fraction=0.9, overhead=0
342+
)
343+
budget_default = MetalWorker._kv_budget_bytes(
344+
self._METAL_LIMIT, self._MODEL_MEM, fraction=0.9
345+
)
346+
347+
assert budget_zero_overhead - budget_default == PAGED_ATTENTION_OVERHEAD_BYTES
348+
349+
def test_large_model_has_positive_budget_at_default_fraction(self) -> None:
350+
# GLM-4.7-Flash-4bit at fraction=0.9 must yield > 1 GB for KV cache.
351+
budget = MetalWorker._kv_budget_bytes(
352+
self._METAL_LIMIT, self._MODEL_MEM, fraction=0.9
353+
)
354+
355+
assert budget > 1e9

vllm_metal/config.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,13 @@ def __post_init__(self) -> None:
4747
"The MLX path must use VLLM_METAL_MEMORY_FRACTION=auto."
4848
)
4949

50+
if self.use_paged_attention and not self.is_auto_memory:
51+
if not (0 < self.memory_fraction <= 1):
52+
raise ValueError(
53+
f"Invalid VLLM_METAL_MEMORY_FRACTION={self.memory_fraction}. "
54+
"Must be a finite value in (0, 1] when paged attention is enabled."
55+
)
56+
5057
@property
5158
def is_auto_memory(self) -> bool:
5259
"""Check if memory fraction is set to auto mode."""

vllm_metal/v1/worker.py

Lines changed: 43 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -147,15 +147,28 @@ def load_model(self) -> None:
147147
):
148148
self._setup_paged_attention()
149149

150+
@staticmethod
151+
def _kv_budget_bytes(
152+
metal_limit: int,
153+
model_memory: int,
154+
fraction: float,
155+
overhead: int = PAGED_ATTENTION_OVERHEAD_BYTES,
156+
) -> int:
157+
"""KV cache budget = fraction of Metal limit minus model and overhead.
158+
159+
All three quantities live in the same domain: Metal-managed memory.
160+
psutil.available is intentionally excluded — it reflects OS page-cache
161+
state and is blind to MLX wired buffers holding model weights.
162+
"""
163+
return int(metal_limit * fraction) - model_memory - overhead
164+
150165
def _setup_paged_attention(self) -> None:
151166
"""Create MetalPagedKVCache and patch model attention for native Metal kernel.
152167
153-
Computes num_blocks from available system RAM, model weight size, and
168+
Computes num_blocks from Metal memory headroom, model weight size, and
154169
a configurable memory fraction, rather than blindly scaling from
155170
max_model_len.
156171
"""
157-
import psutil
158-
159172
from vllm_metal.metal_kernel_backend.cache import MetalPagedKVCache
160173
from vllm_metal.metal_kernel_backend.paged_attention import (
161174
patch_model_attention_metal_kernel,
@@ -175,42 +188,39 @@ def _setup_paged_attention(self) -> None:
175188
else:
176189
fraction = self.metal_config.memory_fraction
177190

178-
# --- Gather memory numbers ---
179-
total_ram = psutil.virtual_memory().total
191+
# --- Gather Metal memory numbers ---
192+
# KV cache lives in Metal-managed (wired) memory. psutil.available
193+
# reflects OS page-cache state and excludes MLX wired buffers, making
194+
# it appear nearly zero when a large model is loaded. Use
195+
# max_recommended_working_set_size — the OS-reported Metal headroom —
196+
# as the budget ceiling instead.
197+
device_info = mx.device_info()
198+
metal_limit = int(device_info.get("max_recommended_working_set_size", 0))
199+
if metal_limit <= 0:
200+
raise RuntimeError(
201+
"Paged attention: mx.device_info() did not return "
202+
"max_recommended_working_set_size. "
203+
"Ensure MLX is up to date and running on Apple Silicon. "
204+
f"Reported device_info keys: {list(device_info.keys())}"
205+
)
180206
model_memory = self._get_model_memory_usage()
181207
per_block_bytes = self.get_cache_block_size_bytes()
182208

183209
# --- Compute KV budget ---
184-
usable_ram = int(total_ram * fraction)
185-
available_ram = psutil.virtual_memory().available
186-
187-
if usable_ram > available_ram:
188-
raise ValueError(
189-
"Paged attention: requested memory exceeds available RAM. "
190-
f"total_ram={total_ram / 1e9:.2f}GB, "
191-
f"fraction={fraction}, "
192-
f"usable_ram={usable_ram / 1e9:.2f}GB, "
193-
f"available_ram={available_ram / 1e9:.2f}GB. "
194-
"The OS and other processes are using "
195-
f"{(total_ram - available_ram) / 1e9:.2f}GB. "
196-
"Mitigations: lower VLLM_METAL_MEMORY_FRACTION "
197-
f"(try {available_ram / total_ram:.2f} or less), "
198-
"close other applications, or add more RAM."
199-
)
200-
201-
kv_budget = usable_ram - model_memory - PAGED_ATTENTION_OVERHEAD_BYTES
210+
usable_metal = int(metal_limit * fraction)
211+
kv_budget = self._kv_budget_bytes(metal_limit, model_memory, fraction)
202212

203213
if kv_budget <= 0:
204214
raise ValueError(
205-
"Paged attention: not enough memory for KV cache. "
206-
f"total_ram={total_ram / 1e9:.2f}GB, "
215+
"Paged attention: not enough Metal memory for KV cache. "
216+
f"metal_limit={metal_limit / 1e9:.2f}GB, "
207217
f"fraction={fraction}, "
208-
f"usable_ram={usable_ram / 1e9:.2f}GB, "
218+
f"usable_metal={usable_metal / 1e9:.2f}GB, "
209219
f"model_memory={model_memory / 1e9:.2f}GB, "
210220
f"overhead={PAGED_ATTENTION_OVERHEAD_BYTES / 1e9:.2f}GB, "
211221
f"kv_budget={kv_budget / 1e9:.2f}GB. "
212222
"Mitigations: increase VLLM_METAL_MEMORY_FRACTION, "
213-
"use a smaller model, or add more RAM."
223+
"use a smaller or more quantized model."
214224
)
215225

216226
num_blocks = kv_budget // per_block_bytes
@@ -219,28 +229,28 @@ def _setup_paged_attention(self) -> None:
219229
raise ValueError(
220230
"Paged attention: computed num_blocks too low "
221231
f"({num_blocks} < minimum {PAGED_ATTENTION_MIN_BLOCKS}). "
222-
f"total_ram={total_ram / 1e9:.2f}GB, "
232+
f"metal_limit={metal_limit / 1e9:.2f}GB, "
223233
f"fraction={fraction}, "
224-
f"usable_ram={usable_ram / 1e9:.2f}GB, "
234+
f"usable_metal={usable_metal / 1e9:.2f}GB, "
225235
f"model_memory={model_memory / 1e9:.2f}GB, "
226236
f"overhead={PAGED_ATTENTION_OVERHEAD_BYTES / 1e9:.2f}GB, "
227237
f"kv_budget={kv_budget / 1e9:.2f}GB, "
228238
f"per_block_bytes={per_block_bytes}. "
229239
"Mitigations: increase VLLM_METAL_MEMORY_FRACTION, "
230-
"use a smaller model, or add more RAM."
240+
"use a smaller or more quantized model."
231241
)
232242

233243
max_tokens_cached = num_blocks * block_size
234244

235245
logger.info(
236246
"Paged attention memory breakdown: "
237-
"total_ram=%.2fGB, fraction=%.2f, usable_ram=%.2fGB, "
247+
"metal_limit=%.2fGB, fraction=%.2f, usable_metal=%.2fGB, "
238248
"model_memory=%.2fGB, overhead=%.2fGB, "
239249
"kv_budget=%.2fGB, per_block_bytes=%d, "
240250
"num_blocks=%d, max_tokens_cached=%d",
241-
total_ram / 1e9,
251+
metal_limit / 1e9,
242252
fraction,
243-
usable_ram / 1e9,
253+
usable_metal / 1e9,
244254
model_memory / 1e9,
245255
PAGED_ATTENTION_OVERHEAD_BYTES / 1e9,
246256
kv_budget / 1e9,

0 commit comments

Comments (0)