Commit fba263c
Add paged KV cache with HF Metal kernel for KV cache read/write and by-reference decode (#92)
### Usage

Paged KV cache on:

```
VLLM_METAL_USE_PAGED_ATTENTION=1 VLLM_METAL_MEMORY_FRACTION=0.3 vllm serve Qwen/Qwen3-0.6B --max-model-len 2048
```

```
vllm bench serve --backend vllm --model Qwen/Qwen3-0.6B \
  --endpoint /v1/completions \
  --dataset-name sharegpt \
  --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \
  --num-prompts 100 \
  --request-rate 10 \
  --max-concurrency 32
```

Baseline (default, paged KV cache off, mlx_lm):

```
vllm serve Qwen/Qwen3-0.6B --max-model-len 2048
```

### Benchmark

Apple M2 Max 36GB, ShareGPT, num-prompts 100, request-rate 10, max-concurrency 32

<img width="2225" height="766" alt="bench_comparison" src="https://github.com/user-attachments/assets/c66c4847-f522-4cec-947c-ef5321c36a0b" />

* TTFT: better
* Output throughput: better
* Mean ITL: worse

Output equivalence (paged KV cache vs mlx_lm), both:

* Total input tokens: 23260
* Total generated tokens: 22061

Memory allocation:

* mlx_lm uses `auto` to claim only as much memory as it needs.
* The paged KV cache uses `VLLM_METAL_MEMORY_FRACTION` and allocates as much memory as possible up front.

The paged KV cache trades higher memory usage for better concurrency, making it faster system-wide than mlx_lm. Whether it is also faster at the kernel level is unclear, but advanced features like continuous batching and chunked prefill are infeasible to support with mlx_lm alone.

### PR Summary

<details>
<summary>Patch the mlx models with the paged attention kernel.</summary>

- mlx_lm requires a contiguous KV cache; this PR uses a paged (non-contiguous) KV cache.
- A paged KV cache is a prerequisite for future continuous batching and real chunked prefill.
- Integrates the https://huggingface.co/kernels-community/paged-attention Metal shader for the paged KV cache on Apple Silicon. (This can be replaced by MLX-native paged attention or other, better kernels in the future.)
- Patches existing mlx_lm model attention layers at runtime with a wrapper that routes cache reads/writes to the external Metal kernel, while keeping MLX for projections and the other layers (see the sketch after this summary).
- Prefill: standard MLX causal SDPA, then writes K/V to the MPS paged cache via `reshape_and_cache`.
- Decode: zero-copy attention via `paged_attention_v1` — reads K/V directly from block tables on the GPU, eliminating the O(seq_len) gather/copy per layer per step.
- Falls back to the original mlx_lm attention when the env var is not set.

</details>

<details>
<summary>Implement the model runner <--> vllm scheduler contract, so they are aligned.</summary>

- For prefill chunks 0:n-1: sample-then-drop the last token.
- For the final prefill chunk n: sample-and-keep the last token.
- For decoding: generate 1 new token.

</details>
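To make the routing concrete, here is a minimal sketch of the patched attention forward path. It is an illustration, not this PR's actual code: `ctx`, `to_torch`, and `to_mlx` are hypothetical stand-ins for the global forward context and the MLX <--> Torch tensor bridge, and the two op signatures are assumed to mirror vLLM's paged-attention calling conventions.

```python
# Sketch of the patched attention forward path (illustrative, NOT the PR's code).
# Assumptions: `ops` is the module returned by
# kernels.get_kernel("kernels-community/paged-attention"), and its two ops
# follow vLLM's paged-attention calling conventions; `to_torch`/`to_mlx`
# stand in for the MLX <-> Torch tensor bridge (zero-copy on unified memory).
import mlx.core as mx
import torch


def patched_attention(q, k, v, ctx, ops, to_torch, to_mlx, scale):
    """q/k/v are MLX arrays; `ctx` is the (hypothetical) global forward
    context carrying slot_mapping, block_tables, seq_lens and the caches."""
    key_cache, value_cache = ctx.kv_cache  # paged cache tensors on MPS

    # Both phases append the new K/V into paged blocks addressed by slot index.
    ops.reshape_and_cache(
        to_torch(k), to_torch(v), key_cache, value_cache,
        ctx.slot_mapping, "auto", ctx.k_scale, ctx.v_scale,
    )

    if not ctx.is_decode:
        # Prefill: standard MLX causal SDPA over the contiguous chunk.
        return mx.fast.scaled_dot_product_attention(
            q, k, v, scale=scale, mask="causal"
        )

    # Decode: read K/V by reference from block tables on the GPU, avoiding
    # the O(seq_len) gather/copy per layer per step.
    q_t = to_torch(q)
    out = torch.empty_like(q_t)
    ops.paged_attention_v1(
        out, q_t, key_cache, value_cache,
        ctx.num_kv_heads, scale, ctx.block_tables, ctx.seq_lens,
        ctx.block_size, ctx.max_seq_len, None, "auto",
        ctx.k_scale, ctx.v_scale,
    )
    return to_mlx(out)
```

When `VLLM_METAL_USE_PAGED_ATTENTION` is unset, the wrapper simply delegates to the original mlx_lm attention.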
### Known Limitations & Planned Future PRs

* **[High Priority]** Setting `VLLM_METAL_MEMORY_FRACTION=0.1` too small hits `RuntimeError: Not enough free blocks: need 21, have 0`. Cause: all KV blocks have been consumed while prefill/decode has not yet finished, so the engine deadlocks. We need to implement vllm-metal's paged KV cache preemption to align with the vLLM scheduler contract.
* **[High Priority]** `torch_to_mlx` in the tensor bridge may not be truly zero-copy.
* [Bug] Not working with HuggingFaceTB/SmolVLM-Instruct; #114 might be the fix.
* [Feature] Re-enable prefix caching under the paged KV cache. Prior version: #80.
* [Medium Priority] Real chunked prefill. This PR's implementation is wasteful. Expected: the prefill of chunk n reads the 0:n-1 KV cache and only prefills chunk n. Actual: all of 0:n is prefilled each time, so the time complexity is quadratic in the number of chunks. Why? Just to satisfy the vLLM scheduler.
* [Medium Priority] Real continuous batching to align with upstream vLLM. This requires variable-length prefill & decode operating on `[total_num_token, *]` instead of the current `[batch, seq, *]`.
* [Refactor] #97
* [Refactor] Five separate forward paths (`_prefill_single`, `_prefill_single_request_paged`, etc.) share the same pre/post-processing. The duplicated code could be merged, but that is purely for aesthetics.
* [Doc] The README architecture figure is no longer accurate.
* ~~[Testing] Need to test on macOS 14/15, Metal 3.2. It is expected to work.~~ It works.

### FAQ

<details>
<summary>Why hack the paged KV cache as a global variable?</summary>

The model's `__call__` signature is `(input_ids, cache=...)` — and mlx_lm's call requires contiguous tensors with no additional parameters. There's no way to pass `slot_mapping`, `block_tables`, or any other per-forward metadata down to the attention layers. This design is inspired by [nano-vllm](https://github.com/GeeeekExplorer/nano-vllm); see the sketch at the end of this message.

</details>

<details>
<summary>Why use this attention kernel?</summary>

This kernel supports variable-length prefill and decode, so attention can be computed over `[total_tokens, *]` instead of `[batch, seq, *]`. This is essential for supporting real continuous batching in the future.

</details>

<details>
<summary>Doesn't each call to the attention kernel trigger an MLX ↔ Torch round trip?</summary>

Yes, but it's okay as long as the MLX-to-Torch round trip is implemented in zero-copy mode. Besides, this kernel can be replaced by better ones in the future if they become available. The most important thing is to get the whole system working end-to-end (chunked prefill, continuous batching) first; then we can swap in better modules later.

</details>

<details>
<summary>What would a future paged attention kernel look like?</summary>

It would need to support variable sequence lengths, `slot_map`, etc. — similar to `flash_attn_varlen_func` and `flash_attn_with_kvcache` from FlashAttention. The difficulties are:

1. HuggingFace kernel libraries only expose PyTorch bindings, which require type conversion from our MLX tensors.
2. As far as I understand, a proper FlashAttention-style implementation would need to be written directly in Metal, not in MLX.

</details>

### Acknowledgement

Early prototype: #71

---------

Signed-off-by: ran <hzz5361@psu.edu>
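Appendix: to make the global-variable design from the first FAQ entry concrete, here is a minimal sketch of a nano-vllm-style forward context. All names in it (`ForwardContext`, `forward_context`, `get_forward_context`) are hypothetical, not this PR's actual identifiers.

```python
# Sketch of a nano-vllm-style global forward context (all names hypothetical).
# The runner installs per-step metadata before calling the model; the patched
# attention layers read it, since mlx_lm's `__call__(input_ids, cache=...)`
# cannot carry slot_mapping/block_tables down the stack.
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Any, Iterator, Optional


@dataclass
class ForwardContext:
    is_decode: bool
    slot_mapping: Any   # per-token slot indices into the paged cache
    block_tables: Any   # per-sequence tables of block indices
    seq_lens: Any       # current length of each sequence in the batch


_FORWARD_CONTEXT: Optional[ForwardContext] = None


@contextmanager
def forward_context(ctx: ForwardContext) -> Iterator[None]:
    """Install `ctx` globally for the duration of one model forward."""
    global _FORWARD_CONTEXT
    _FORWARD_CONTEXT = ctx
    try:
        yield
    finally:
        _FORWARD_CONTEXT = None


def get_forward_context() -> ForwardContext:
    """Called from inside the patched attention layers."""
    assert _FORWARD_CONTEXT is not None, "forward called outside forward_context()"
    return _FORWARD_CONTEXT
```

The runner then wraps each step as `with forward_context(ctx): model(input_ids, cache=...)`, so mlx_lm's call signature never has to change.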
1 parent 3fbaea6 commit fba263c

17 files changed: 2018 additions & 112 deletions

README.md

Lines changed: 2 additions & 1 deletion
```diff
@@ -9,7 +9,7 @@ vLLM Metal is a plugin that enables vLLM to run on Apple Silicon Macs using MLX
 - **MLX-accelerated inference**: faster than PyTorch MPS on Apple Silicon
 - **Unified memory**: True zero-copy operations leveraging Apple Silicon's unified memory architecture
 - **vLLM compatibility**: Full integration with vLLM's engine, scheduler, and OpenAI-compatible API
-- **Paged attention**: Efficient KV cache management for long sequences
+- **Paged attention** *(experimental)*: Efficient KV cache management for long sequences — opt-in via `VLLM_METAL_USE_PAGED_ATTENTION=1` (requires `pip install 'vllm-metal[paged]'`); default path uses MLX-managed KV cache
 - **GQA support**: Grouped-Query Attention for efficient inference
 
 ## Requirements
@@ -78,6 +78,7 @@ Environment variables for customization:
 | `VLLM_METAL_USE_MLX` | `1` | Use MLX for compute (1=yes, 0=no) |
 | `VLLM_MLX_DEVICE` | `gpu` | MLX device (`gpu` or `cpu`) |
 | `VLLM_METAL_BLOCK_SIZE` | `16` | KV cache block size |
+| `VLLM_METAL_USE_PAGED_ATTENTION` | `0` | Enable experimental paged KV cache (requires `pip install 'vllm-metal[paged]'`) |
 | `VLLM_METAL_DEBUG` | `0` | Enable debug logging |
 | `VLLM_USE_MODELSCOPE` | `False` | Set True to change model registry to <https://www.modelscope.cn/> |
 | `VLLM_METAL_MODELSCOPE_CACHE` | None | Specify the absolute path of the local model |
```

pyproject.toml

Lines changed: 5 additions & 1 deletion
```diff
@@ -41,6 +41,10 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
+paged = [
+    # Paged attention Metal kernel (opt-in, experimental)
+    "kernels>=0.4.5; platform_system == 'Darwin' and platform_machine == 'arm64'",
+]
 vllm = ["vllm>=0.14.0"]
 stt = [
     # Speech-to-text audio processing (Whisper models)
@@ -54,7 +58,7 @@ dev = [
     "mypy>=1.19.1",
 ]
 all = [
-    "vllm-metal[vllm,stt,dev]",
+    "vllm-metal[vllm,paged,stt,dev]",
 ]
 
 [project.urls]
```
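Because the kernel ships as an optional `paged` extra, the `kernels` package may be absent at runtime. A minimal guard, as a sketch (the helper name is an assumption, not part of this diff):

```python
# Sketch: probe the optional `paged` extra at runtime (helper name assumed).
def paged_attention_available() -> bool:
    """True if the HF `kernels` package (pip install 'vllm-metal[paged]')
    is importable; otherwise the plugin falls back to the MLX-managed cache."""
    try:
        import kernels  # noqa: F401
    except ImportError:
        return False
    return True
```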

scripts/lib.sh

Lines changed: 1 addition & 1 deletion
```diff
@@ -49,7 +49,7 @@ ensure_venv() {
 # Install dev dependencies
 install_dev_deps() {
     section "Installing dependencies"
-    uv pip install -e ".[dev]"
+    uv pip install -e ".[dev,paged]"
 }
 
 # Full development environment setup
```

src/lib.rs

Lines changed: 9 additions & 9 deletions
```diff
@@ -21,7 +21,7 @@ pub struct BlockAllocator {
     /// Free blocks stored in a deque for O(1) operations
     free_blocks: VecDeque<usize>,
     /// Mapping from sequence ID to allocated blocks
-    sequence_blocks: HashMap<i64, Vec<usize>>,
+    sequence_blocks: HashMap<String, Vec<usize>>,
     /// Total number of blocks
     num_blocks: usize,
 }
@@ -53,7 +53,7 @@ impl BlockAllocator {
     ///
     /// # Raises
     /// RuntimeError if not enough free blocks
-    pub fn allocate_blocks(&mut self, seq_id: i64, num_blocks: usize) -> PyResult<Vec<usize>> {
+    pub fn allocate_blocks(&mut self, seq_id: String, num_blocks: usize) -> PyResult<Vec<usize>> {
         if self.free_blocks.len() < num_blocks {
             return Err(pyo3::exceptions::PyRuntimeError::new_err(format!(
                 "Not enough free blocks: need {}, have {}",
@@ -83,7 +83,7 @@ impl BlockAllocator {
     ///
     /// # Arguments
     /// * `seq_id` - Sequence identifier
-    pub fn free_sequence(&mut self, seq_id: i64) {
+    pub fn free_sequence(&mut self, seq_id: String) {
         if let Some(blocks) = self.sequence_blocks.remove(&seq_id) {
             // Return blocks to the free pool
             for block_idx in blocks {
@@ -99,7 +99,7 @@ impl BlockAllocator {
     ///
     /// # Returns
     /// List of block indices for the sequence
-    pub fn get_sequence_blocks(&self, seq_id: i64) -> Vec<usize> {
+    pub fn get_sequence_blocks(&self, seq_id: String) -> Vec<usize> {
         self.sequence_blocks
             .get(&seq_id)
             .cloned()
@@ -119,7 +119,7 @@ impl BlockAllocator {
     }
 
     /// Check if sequence has blocks allocated.
-    pub fn has_sequence(&self, seq_id: i64) -> bool {
+    pub fn has_sequence(&self, seq_id: String) -> bool {
         self.sequence_blocks.contains_key(&seq_id)
     }
 
@@ -130,7 +130,7 @@ impl BlockAllocator {
     }
 
     /// Get all sequence blocks as a dictionary.
-    pub fn get_all_sequence_blocks(&self) -> HashMap<i64, Vec<usize>> {
+    pub fn get_all_sequence_blocks(&self) -> HashMap<String, Vec<usize>> {
         self.sequence_blocks.clone()
     }
 }
@@ -280,10 +280,10 @@ pub fn compute_kv_block_indices(
 /// Batch compute block indices for multiple sequences.
 #[pyfunction]
 pub fn batch_compute_kv_indices(
-    sequence_blocks: HashMap<i64, Vec<usize>>,
-    seq_lens: HashMap<i64, usize>,
+    sequence_blocks: HashMap<String, Vec<usize>>,
+    seq_lens: HashMap<String, usize>,
     block_size: usize,
-) -> HashMap<i64, Vec<(usize, usize, usize)>> {
+) -> HashMap<String, Vec<(usize, usize, usize)>> {
     let mut result = HashMap::with_capacity(sequence_blocks.len());
 
     for (seq_id, blocks) in sequence_blocks {
```
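The `i64` → `String` change keys the allocator by vLLM's string request IDs instead of integer sequence numbers. A minimal usage sketch from Python, assuming the pyo3 module name and constructor shape (both are assumptions, neither appears in this diff):

```python
# Hypothetical usage of the Rust BlockAllocator via its pyo3 bindings.
# The module name and constructor signature below are assumptions.
from vllm_metal._rust import BlockAllocator

allocator = BlockAllocator(num_blocks=256)

# vLLM request IDs are strings, hence HashMap<String, Vec<usize>>.
blocks = allocator.allocate_blocks("req-abc123", 4)
assert allocator.has_sequence("req-abc123")

# Exhausting the pool raises the error noted under Known Limitations:
#   RuntimeError: Not enough free blocks: need 21, have 0
allocator.free_sequence("req-abc123")  # blocks return to the free pool
```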

tests/test_kernel_loader.py

Lines changed: 128 additions & 0 deletions
```python
# SPDX-License-Identifier: Apache-2.0
"""Tests for kernel_loader: OS-aware revision pinning for Metal compatibility.

Verifies that:
- macOS 16+ uses the latest HF kernel (default revision)
- macOS 15 and earlier pins to the Nov 2025 compat revision (Metal 3.2)
- Both revisions actually load and expose the expected ops

Run with:
    python -m pytest tests/test_kernel_loader.py -v -s
"""

from __future__ import annotations

from unittest import mock

import pytest

pytest.importorskip("kernels")

# ---------------------------------------------------------------------------
# Unit tests (no network, no GPU)
# ---------------------------------------------------------------------------


class TestNeedsCompatRevision:
    """Test _needs_compat_revision() with mocked macOS versions."""

    @pytest.mark.parametrize(
        "ver, expected",
        [
            ("15.7.4", True),   # macOS 15 — needs compat
            ("14.5", True),     # macOS 14 — needs compat
            ("26.3", False),    # macOS 26 — modern
            ("", False),        # empty — safe default
        ],
    )
    def test_version_check(self, ver, expected):
        from vllm_metal.metal_kernel_backend.kernel_loader import _needs_compat_revision

        with mock.patch("platform.mac_ver", return_value=(ver, ("", "", ""), "")):
            assert _needs_compat_revision() is expected


class TestGetKernelRevisionSelection:
    """Test that get_paged_attention_ops passes the right revision to get_kernel."""

    def _reset_kernel_cache(self):
        import vllm_metal.metal_kernel_backend.kernel_loader as kl

        kl._kernel = None

    def test_macos_15_uses_compat_revision(self):
        self._reset_kernel_cache()
        with (
            mock.patch("platform.mac_ver", return_value=("15.7.4", ("", "", ""), "")),
            mock.patch("kernels.get_kernel", return_value=mock.MagicMock()) as mk,
        ):
            from vllm_metal.metal_kernel_backend.kernel_loader import (
                _MACOS15_COMPAT_REVISION,
                get_paged_attention_ops,
            )

            get_paged_attention_ops()
            mk.assert_called_once_with(
                "kernels-community/paged-attention",
                revision=_MACOS15_COMPAT_REVISION,
            )
        self._reset_kernel_cache()

    def test_macos_26_uses_latest(self):
        self._reset_kernel_cache()
        with (
            mock.patch("platform.mac_ver", return_value=("26.3", ("", "", ""), "")),
            mock.patch("kernels.get_kernel", return_value=mock.MagicMock()) as mk,
        ):
            from vllm_metal.metal_kernel_backend.kernel_loader import (
                get_paged_attention_ops,
            )

            get_paged_attention_ops()
            mk.assert_called_once_with(
                "kernels-community/paged-attention",
                revision=None,
            )
        self._reset_kernel_cache()


# ---------------------------------------------------------------------------
# Integration tests (require network + MPS)
# ---------------------------------------------------------------------------


def _mps_available() -> bool:
    try:
        import torch

        return torch.backends.mps.is_available()
    except Exception:
        return False


@pytest.mark.skipif(not _mps_available(), reason="MPS not available")
class TestKernelLoadsForReal:
    """Actually load the kernel from HuggingFace and verify ops exist."""

    _EXPECTED_OPS = {"reshape_and_cache", "paged_attention_v1"}

    def test_latest_revision_loads(self):
        from kernels import get_kernel

        kernel = get_kernel("kernels-community/paged-attention")
        ops = set(dir(kernel))
        assert self._EXPECTED_OPS <= ops, f"Missing ops: {self._EXPECTED_OPS - ops}"

    def test_compat_revision_loads(self):
        from kernels import get_kernel

        from vllm_metal.metal_kernel_backend.kernel_loader import (
            _MACOS15_COMPAT_REVISION,
        )

        kernel = get_kernel(
            "kernels-community/paged-attention",
            revision=_MACOS15_COMPAT_REVISION,
        )
        ops = set(dir(kernel))
        assert self._EXPECTED_OPS <= ops, f"Missing ops: {self._EXPECTED_OPS - ops}"
```

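The tests above pin down the loader's contract; here is a minimal `kernel_loader` sketch consistent with them. The real pinned revision hash does not appear in this diff, so a placeholder stands in for it; treat the whole module as an illustration, not the shipped code.

```python
# Sketch of a kernel_loader consistent with the tests above (NOT the shipped
# module). The compat revision hash is elided because it is not in this diff.
from __future__ import annotations

import platform

_MACOS15_COMPAT_REVISION = "<nov-2025-metal-3.2-compat-revision>"  # placeholder

_kernel = None  # cached kernel module; the tests reset this to None


def _needs_compat_revision() -> bool:
    """True on macOS 15 and earlier (Metal 3.2 only); False on 16+/26+."""
    ver = platform.mac_ver()[0]
    if not ver:
        return False  # unknown version: safe default is the latest kernel
    major = int(ver.split(".")[0])
    return major < 16


def get_paged_attention_ops():
    """Load (and cache) the HF paged-attention Metal kernel, pinning the
    older revision on macOS versions that only support Metal 3.2."""
    global _kernel
    if _kernel is None:
        import kernels

        _kernel = kernels.get_kernel(
            "kernels-community/paged-attention",
            revision=_MACOS15_COMPAT_REVISION if _needs_compat_revision() else None,
        )
    return _kernel
```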