vllm-project
diff --git a/README.md b/README.md
Lines changed: 27 additions & 27 deletions
diff --git a/tests/__init__.py b/tests/__init__.py
Lines changed: 2 additions & 0 deletions
diff --git a/tests/test_cache.py b/tests/test_cache.py
Lines changed: 208 additions & 0 deletions
diff --git a/tests/test_config.py b/tests/test_config.py
Lines changed: 78 additions & 0 deletions
@@ -27,39 +27,39 @@ vLLM Metal is a plugin that enables vLLM to run on Apple Silicon Macs using MLX
 ## Architecture
 
 ```
-┌────────────────────────────────────────────────────────────┐
-│                    vLLM Core (Unchanged)                   │
-│         Engine, Scheduler, API Server, Tokenizers          │
-└────────────────────────────────────────────────────────────┘
+┌──────────────────────────────────────────────────────────┐
+│                    vLLM Core (Unchanged)                 │
+│         Engine, Scheduler, API Server, Tokenizers        │
+└──────────────────────────────────────────────────────────┘
                               │
                               ▼
-┌────────────────────────────────────────────────────────────┐
-│                 vllm_metal Plugin Layer                    │
-│  ┌─────────────┐  ┌─────────────┐  ┌─────────────────────┐ │
-│  │MetalPlatform│  │ MetalWorker │  │ MetalModelRunner    │ │
-│  │ (Platform)  │  │ (Worker)    │  │ (ModelRunner)       │ │
-│  └─────────────┘  └─────────────┘  └─────────────────────┘ │
-└────────────────────────────────────────────────────────────┘
+┌──────────────────────────────────────────────────────────┐
+│                 vllm_metal Plugin Layer                  │
+│  ┌─────────────┐  ┌─────────────┐  ┌───────────────────┐ │
+│  │MetalPlatform│  │ MetalWorker │  │ MetalModelRunner  │ │
+│  │ (Platform)  │  │ (Worker)    │  │ (ModelRunner)     │ │
+│  └─────────────┘  └─────────────┘  └───────────────────┘ │
+└──────────────────────────────────────────────────────────┘
                               │
                               ▼
-┌────────────────────────────────────────────────────────────┐
-│              Unified Compute Backend                       │
-│  ┌──────────────────────┐  ┌─────────────────────────────┐ │
-│  │   MLX Backend        │  │   PyTorch Backend           │ │
-│  │   (Primary)          │  │   (Model Loading/Interop)   │ │
-│  │                      │  │                             │ │
-│  │ • SDPA Attention     │  │ • HuggingFace Loading       │ │
-│  │ • RMSNorm            │  │ • Weight Conversion         │ │
-│  │ • RoPE               │  │ • Tensor Bridge             │ │
-│  │ • Cache Ops          │  │                             │ │
-│  └──────────────────────┘  └─────────────────────────────┘ │
-└────────────────────────────────────────────────────────────┘
+┌──────────────────────────────────────────────────────────┐
+│              Unified Compute Backend                     │
+│  ┌──────────────────────┐  ┌───────────────────────────┐ │
+│  │   MLX Backend        │  │   PyTorch Backend         │ │
+│  │   (Primary)          │  │   (Model Loading/Interop) │ │
+│  │                      │  │                           │ │
+│  │ • SDPA Attention     │  │ • HuggingFace Loading     │ │
+│  │ • RMSNorm            │  │ • Weight Conversion       │ │
+│  │ • RoPE               │  │ • Tensor Bridge           │ │
+│  │ • Cache Ops          │  │                           │ │
+│  └──────────────────────┘  └───────────────────────────┘ │
+└──────────────────────────────────────────────────────────┘
                               │
                               ▼
-┌────────────────────────────────────────────────────────────┐
-│                    Metal GPU Layer                         │
-│         Apple Silicon Unified Memory Architecture          │
-└────────────────────────────────────────────────────────────┘
+┌──────────────────────────────────────────────────────────┐
+│                    Metal GPU Layer                       │
+│         Apple Silicon Unified Memory Architecture        │
+└──────────────────────────────────────────────────────────┘
 ```
 
 ## Configuration
 
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for vLLM Metal plugin."""
@@ -0,0 +1,208 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for KV cache implementations."""
+
+import mlx.core as mx
+import pytest
+
+from vllm_metal.mlx_backend.cache import KVCache, PagedKVCache
+
+
+class TestKVCache:
+    """Tests for simple KV cache."""
+
+    def test_cache_initialization(self) -> None:
+        """Test cache initialization."""
+        cache = KVCache(
+            num_layers=4,
+            num_kv_heads=8,
+            head_dim=64,
+            max_seq_len=512,
+        )
+
+        assert cache.num_layers == 4
+        assert cache.num_kv_heads == 8
+        assert cache.head_dim == 64
+        assert cache.max_seq_len == 512
+        assert cache.seq_len == 0
+
+    def test_cache_update(self) -> None:
+        """Test cache update and retrieval."""
+        cache = KVCache(
+            num_layers=2,
+            num_kv_heads=4,
+            head_dim=32,
+            max_seq_len=128,
+        )
+
+        # Random K/V shaped (batch, seq_len, 4, 32); 4 and 32 match the cache config
+        batch = 1
+        seq_len = 4
+        key = mx.random.normal((batch, seq_len, 4, 32))
+        value = mx.random.normal((batch, seq_len, 4, 32))
+        positions = mx.arange(seq_len)[None, :]
+
+        # Update layer 0; update() hands back the cached key/value pair
+        cached_k, cached_v = cache.update(0, key, value, positions)
+        mx.eval(cached_k, cached_v)
+
+        assert cached_k.shape == (1, seq_len, 4, 32)
+        assert cached_v.shape == (1, seq_len, 4, 32)
+        assert cache.seq_len == seq_len
+
+    def test_cache_incremental_update(self) -> None:
+        """Test incremental cache updates."""
+        cache = KVCache(
+            num_layers=2,
+            num_kv_heads=4,
+            head_dim=32,
+            max_seq_len=128,
+        )
+
+        # First update: 4 tokens at positions 0..3
+        key1 = mx.random.normal((1, 4, 4, 32))
+        value1 = mx.random.normal((1, 4, 4, 32))
+        positions1 = mx.arange(4)[None, :]
+
+        cache.update(0, key1, value1, positions1)
+        assert cache.seq_len == 4
+
+        # Second update (incremental): a single token at position 4
+        key2 = mx.random.normal((1, 1, 4, 32))
+        value2 = mx.random.normal((1, 1, 4, 32))
+        positions2 = mx.array([[4]])
+
+        cached_k, cached_v = cache.update(0, key2, value2, positions2)
+        mx.eval(cached_k, cached_v)
+
+        assert cached_k.shape == (1, 5, 4, 32)
+        assert cached_v.shape == (1, 5, 4, 32)
+        assert cache.seq_len == 5
+
+    def test_cache_reset(self) -> None:
+        """Test cache reset."""
+        cache = KVCache(
+            num_layers=2,
+            num_kv_heads=4,
+            head_dim=32,
+            max_seq_len=128,
+        )
+
+        # Fill layer 0 with 4 positions so seq_len is non-zero before the reset
+        key = mx.random.normal((1, 4, 4, 32))
+        value = mx.random.normal((1, 4, 4, 32))
+        positions = mx.arange(4)[None, :]
+        cache.update(0, key, value, positions)
+
+        assert cache.seq_len == 4
+
+        # Reset, then confirm seq_len is back to 0
+        cache.reset()
+
+        assert cache.seq_len == 0
+
+
+class TestPagedKVCache:
+    """Tests for paged KV cache."""
+
+    def test_paged_cache_initialization(self) -> None:
+        """Test paged cache initialization."""
+        cache = PagedKVCache(
+            num_layers=4,
+            num_kv_heads=8,
+            head_dim=64,
+            num_blocks=100,
+            block_size=16,
+        )
+
+        assert cache.num_layers == 4
+        assert cache.num_kv_heads == 8
+        assert cache.head_dim == 64
+        assert cache.num_blocks == 100
+        assert cache.block_size == 16
+        assert cache.num_free_blocks == 100
+
+    def test_block_allocation(self) -> None:
+        """Test block allocation."""
+        cache = PagedKVCache(
+            num_layers=2,
+            num_kv_heads=4,
+            head_dim=32,
+            num_blocks=10,
+            block_size=16,
+        )
+
+        # Allocate 3 of the 10 blocks to sequence 0; 7 should remain free
+        blocks = cache.allocate_blocks(seq_id=0, num_blocks=3)
+
+        assert len(blocks) == 3
+        assert cache.num_free_blocks == 7
+        assert 0 in cache.sequence_blocks
+
+    def test_block_allocation_insufficient(self) -> None:
+        """Test block allocation with insufficient blocks."""
+        cache = PagedKVCache(
+            num_layers=2,
+            num_kv_heads=4,
+            head_dim=32,
+            num_blocks=5,
+            block_size=16,
+        )
+
+        # Asking for 10 blocks from a 5-block pool must raise RuntimeError
+        with pytest.raises(RuntimeError, match="Not enough free blocks"):
+            cache.allocate_blocks(seq_id=0, num_blocks=10)
+
+    def test_sequence_free(self) -> None:
+        """Test freeing sequence blocks."""
+        cache = PagedKVCache(
+            num_layers=2,
+            num_kv_heads=4,
+            head_dim=32,
+            num_blocks=10,
+            block_size=16,
+        )
+
+        # Allocate 3 + 2 blocks to two sequences, leaving 5 of 10 free
+        cache.allocate_blocks(seq_id=0, num_blocks=3)
+        cache.allocate_blocks(seq_id=1, num_blocks=2)
+
+        assert cache.num_free_blocks == 5
+
+        # Free sequence 0: its 3 blocks become free again, sequence 1 stays tracked
+        cache.free_sequence(seq_id=0)
+
+        assert cache.num_free_blocks == 8
+        assert 0 not in cache.sequence_blocks
+        assert 1 in cache.sequence_blocks
+
+    def test_block_update(self) -> None:
+        """Test updating block contents."""
+        cache = PagedKVCache(
+            num_layers=2,
+            num_kv_heads=4,
+            head_dim=32,
+            num_blocks=10,
+            block_size=16,
+        )
+
+        blocks = cache.allocate_blocks(seq_id=0, num_blocks=1)
+        block_idx = blocks[0]
+
+        # Random K/V with leading dim 8 (block_size is 16), written at slot_offset=0
+        key = mx.random.normal((8, 4, 32))
+        value = mx.random.normal((8, 4, 32))
+
+        cache.update_block(
+            block_idx=block_idx,
+            layer_idx=0,
+            key=key,
+            value=value,
+            slot_offset=0,
+        )
+
+        # Read back 8 positions of sequence 0, layer 0; only shapes are checked
+        cached_k, cached_v = cache.get_sequence_kv(seq_id=0, layer_idx=0, seq_len=8)
+        mx.eval(cached_k, cached_v)
+
+        assert cached_k.shape == (8, 4, 32)
+        assert cached_v.shape == (8, 4, 32)
@@ -0,0 +1,78 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for vLLM Metal configuration."""
+
+import os
+
+from vllm_metal.config import MetalConfig, get_config, reset_config
+
+
+class TestMetalConfig:
+    """Tests for MetalConfig class."""
+
+    def setup_method(self) -> None:
+        """Reset config before each test."""
+        reset_config()
+        # Clear vars these tests set; NOTE(review): prior values are not restored
+        for var in [
+            "VLLM_METAL_MEMORY_FRACTION",
+            "VLLM_METAL_USE_MLX",
+            "VLLM_MLX_DEVICE",
+            "VLLM_METAL_BLOCK_SIZE",
+            "VLLM_METAL_DEBUG",
+        ]:
+            os.environ.pop(var, None)
+
+    def teardown_method(self) -> None:
+        """Reset config after each test."""
+        reset_config()
+        for var in [
+            "VLLM_METAL_MEMORY_FRACTION",
+            "VLLM_METAL_USE_MLX",
+            "VLLM_MLX_DEVICE",
+            "VLLM_METAL_BLOCK_SIZE",
+            "VLLM_METAL_DEBUG",
+        ]:
+            os.environ.pop(var, None)
+
+    def test_default_config(self) -> None:
+        """Test default configuration values."""
+        config = MetalConfig.from_env()
+
+        assert config.memory_fraction == 0.9
+        assert config.use_mlx is True
+        assert config.mlx_device == "gpu"
+        assert config.block_size == 16
+        assert config.debug is False
+
+    def test_custom_config_from_env(self) -> None:
+        """Test configuration from environment variables."""
+        os.environ["VLLM_METAL_MEMORY_FRACTION"] = "0.75"
+        os.environ["VLLM_METAL_USE_MLX"] = "0"
+        os.environ["VLLM_MLX_DEVICE"] = "cpu"
+        os.environ["VLLM_METAL_BLOCK_SIZE"] = "32"
+        os.environ["VLLM_METAL_DEBUG"] = "1"
+
+        config = MetalConfig.from_env()
+
+        assert config.memory_fraction == 0.75
+        assert config.use_mlx is False
+        assert config.mlx_device == "cpu"
+        assert config.block_size == 32
+        assert config.debug is True
+
+    def test_get_config_singleton(self) -> None:
+        """Test that get_config returns a singleton."""
+        config1 = get_config()
+        config2 = get_config()
+
+        assert config1 is config2
+
+    def test_reset_config(self) -> None:
+        """Test that reset_config clears the singleton."""
+        config1 = get_config()
+        reset_config()
+        config2 = get_config()
+
+        # After reset, get_config() must build a fresh instance
+        # (presumably equal values, as env vars are unchanged; not asserted here)
+        assert config1 is not config2
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+# SPDX-License-Identifier: Apache-2.0`
	`2`	`+"""Tests for vLLM Metal plugin."""`