fix: set block_size to 1

gufengc · gufengc · commit b31cb9f9dfbe · 2026-06-02T12:27:34.000+08:00
diff --git a/src/parallax/server/block_radix_cache.py b/src/parallax/server/block_radix_cache.py
@@ -164,11 +164,6 @@ def insert_block(
 
         self.num_cached_blocks += 1
 
-        logger.debug(
-            f"Inserted new block: block_id={block_id}, "
-            f"tokens={token_ids[:5]}..., total_cached={self.num_cached_blocks}"
-        )
-
         if self.num_cached_blocks > self.max_cached_blocks:
             self._evict_lru_blocks(self.num_cached_blocks - self.max_cached_blocks)
 
@@ -190,11 +185,6 @@ def decrease_lock_ref(self, nodes: List[BlockTreeNode]):
             if node.lock_ref > 0:
                 node.lock_ref -= 1
 
-            if node.lock_ref == 0:
-                logger.debug(
-                    f"Node {node.node_id} (block_id={node.block_id}) ref count = 0, evictable"
-                )
-
     def register_request(self, request_id: str, nodes: List[BlockTreeNode]):
         """Register nodes used by request."""
         self.request_to_nodes[request_id] = nodes
diff --git a/src/parallax/server/cache_manager.py b/src/parallax/server/cache_manager.py
@@ -43,7 +43,7 @@ def __init__(
         linear_num_v_heads: Optional[int] = None,
         # Prefix Cache Config
         enable_prefix_cache: bool = False,
-        max_cached_blocks: int = 1000,
+        max_cached_blocks: Optional[int] = None,
         sliding_window: Optional[int] = None,
     ):
         self.num_layers = num_layers
@@ -83,6 +83,9 @@ def __init__(
             num_gpu_blocks = 0
 
         self.num_gpu_blocks = num_gpu_blocks
+        self.max_cached_blocks = (
+            self.num_gpu_blocks if max_cached_blocks is None else max_cached_blocks
+        )
 
         # 1. Initialize Allocators
         self.allocator = (
@@ -121,10 +124,10 @@ def __init__(
         if enable_prefix_cache and self.needs_blocks:
             self.prefix_cache = BlockRadixCache(
                 block_size=block_size,
-                max_cached_blocks=max_cached_blocks,
+                max_cached_blocks=self.max_cached_blocks,
                 on_block_evict=self._on_prefix_block_evict,
             )
-            logger.info(f"Prefix cache enabled with max_cached_blocks={max_cached_blocks}")
+            logger.info(f"Prefix cache enabled with max_cached_blocks={self.max_cached_blocks}")
 
         # Mapping: request_id -> token_ids (for prefix matching)
         self.request_token_ids: Dict[str, List[int]] = {}
@@ -548,11 +551,6 @@ def insert_full_blocks_to_cache(self, request_id: str):
             parent_path.append(new_node)
             registered_nodes.append(new_node)
 
-            logger.debug(
-                f"Request {request_id}: Inserted block {block_idx} "
-                f"(block_id={block_id}) to prefix cache"
-            )
-
         if registered_nodes:
             if request_id in self.prefix_cache.request_to_nodes:
                 old_nodes = self.prefix_cache.request_to_nodes[request_id]
diff --git a/src/parallax/server/executor/mlx_executor.py b/src/parallax/server/executor/mlx_executor.py
@@ -178,7 +178,7 @@ def __init__(
             sliding_window = None
 
         # Validate and adjust block size for Metal backend
-        supported_block_sizes = [8, 16, 32, 64]
+        supported_block_sizes = [1, 8, 16, 32, 64]
         if kv_block_size not in supported_block_sizes:
             nearest_block_size = min(supported_block_sizes, key=lambda x: abs(x - kv_block_size))
             logger.warning(
diff --git a/src/parallax/server/server_args.py b/src/parallax/server/server_args.py
@@ -102,7 +102,7 @@ def parse_args() -> argparse.Namespace:
     )
 
     parser.add_argument(
-        "--kv-block-size", type=int, default=32, help="Block size for KV cache management"
+        "--kv-block-size", type=int, default=1, help="Block size for KV cache management"
     )
 
     parser.add_argument(
diff --git a/src/parallax_extensions/kernels/paged_attention.metal b/src/parallax_extensions/kernels/paged_attention.metal
@@ -1456,6 +1456,8 @@ template <typename T, int HEAD_SIZE, int NUM_THREADS, int NUM_SIMD_LANES,
 
 #define instantiate_paged_attention_block_size(type, cache_type, num_threads,  \
                                                num_simd_lanes, partition_size) \
+  instantiate_paged_attention_heads(type, cache_type, 1, num_threads,          \
+                                    num_simd_lanes, partition_size);           \
   instantiate_paged_attention_heads(type, cache_type, 8, num_threads,          \
                                     num_simd_lanes, partition_size);           \
   instantiate_paged_attention_heads(type, cache_type, 16, num_threads,         \
diff --git a/src/parallax_extensions/lib/parallax_ext.metallib b/src/parallax_extensions/lib/parallax_ext.metallib

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -102,7 +102,7 @@ def parse_args() -> argparse.Namespace:` |
| `102` | `102` | `)` |
| `103` | `103` |  |
| `104` | `104` | `parser.add_argument(` |
| `105` |  | `- "--kv-block-size", type=int, default=32, help="Block size for KV cache management"` |
|  | `105` | `+ "--kv-block-size", type=int, default=1, help="Block size for KV cache management"` |
| `106` | `106` | `)` |
| `107` | `107` |  |
| `108` | `108` | `parser.add_argument(` |