@@ -43,7 +43,7 @@ def __init__(
4343 linear_num_v_heads : Optional [int ] = None ,
4444 # Prefix Cache Config
4545 enable_prefix_cache : bool = False ,
46- max_cached_blocks : int = 1000 ,
46+ max_cached_blocks : Optional [ int ] = None ,
4747 sliding_window : Optional [int ] = None ,
4848 ):
4949 self .num_layers = num_layers
@@ -83,6 +83,9 @@ def __init__(
8383 num_gpu_blocks = 0
8484
8585 self .num_gpu_blocks = num_gpu_blocks
86+ self .max_cached_blocks = (
87+ self .num_gpu_blocks if max_cached_blocks is None else max_cached_blocks
88+ )
8689
8790 # 1. Initialize Allocators
8891 self .allocator = (
@@ -121,10 +124,10 @@ def __init__(
121124 if enable_prefix_cache and self .needs_blocks :
122125 self .prefix_cache = BlockRadixCache (
123126 block_size = block_size ,
124- max_cached_blocks = max_cached_blocks ,
127+ max_cached_blocks = self . max_cached_blocks ,
125128 on_block_evict = self ._on_prefix_block_evict ,
126129 )
127- logger .info (f"Prefix cache enabled with max_cached_blocks={ max_cached_blocks } " )
130+ logger .info (f"Prefix cache enabled with max_cached_blocks={ self . max_cached_blocks } " )
128131
129132 # Mapping: request_id -> token_ids (for prefix matching)
130133 self .request_token_ids : Dict [str , List [int ]] = {}
@@ -548,11 +551,6 @@ def insert_full_blocks_to_cache(self, request_id: str):
548551 parent_path .append (new_node )
549552 registered_nodes .append (new_node )
550553
551- logger .debug (
552- f"Request { request_id } : Inserted block { block_idx } "
553- f"(block_id={ block_id } ) to prefix cache"
554- )
555-
556554 if registered_nodes :
557555 if request_id in self .prefix_cache .request_to_nodes :
558556 old_nodes = self .prefix_cache .request_to_nodes [request_id ]
0 commit comments