Set hybrid KV cache defaults; evict local KV slots before allocation and require radix cache to be disabled

tarinkk · tarinkk · commit d053027d2f53 · 2025-05-25T00:29:06.000Z
diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
@@ -288,7 +288,7 @@ def get_total_num_kv_heads(self) -> int:
             num_kv_heads = getattr(self.hf_text_config, attr, None)
             if num_kv_heads is not None:
                 return num_kv_heads
-
+            
         # For non-grouped-query attention models, the number of KV heads is
         # equal to the number of attention heads.
         return self.hf_text_config.num_attention_heads
diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
@@ -1168,7 +1168,7 @@ def prepare_for_extend(self):
                         (req.req_pool_idx, slice(0, pre_len)), req.prefix_indices_local
                     )
                     self.tree_cache.evict_hybrid(
-                        req, self.model_config.attention_chunk_size
+                        req, pre_len, self.model_config.attention_chunk_size
                     )
 
             # If input_embeds are available, store them
@@ -1560,6 +1560,13 @@ def prepare_for_decode(self):
             self.seq_lens.add_(1)
         self.seq_lens_sum += bs
 
+        # free memory
+        if self.token_to_kv_pool_allocator_local is not None:
+            for req in self.reqs:
+                self.tree_cache.evict_hybrid(
+                    req, req.seqlen - 1, self.model_config.attention_chunk_size
+                )
+
         # Allocate memory
         if self.token_to_kv_pool_allocator.page_size == 1:
             self.out_cache_loc = self.alloc_token_slots(bs)
@@ -1582,10 +1589,6 @@ def prepare_for_decode(self):
             self.req_to_token_pool.write_local(
                 (self.req_pool_indices, locs), self.out_cache_loc_local.to(torch.int32)
             )
-            for req in self.reqs:
-                self.tree_cache.evict_hybrid(
-                    req, self.model_config.attention_chunk_size
-                )
 
     def filter_batch(
         self,
diff --git a/python/sglang/srt/mem_cache/chunk_cache.py b/python/sglang/srt/mem_cache/chunk_cache.py
@@ -72,17 +72,18 @@ def insert(self):
     def evict_hybrid(
         self,
         req: Req,
+        prelen: int,
         attention_chunk_size: int,
     ):
-        if req.seqlen > req.evicted_seqlen_local + attention_chunk_size:
-            loc_idx = attention_chunk_size * (req.seqlen // attention_chunk_size)
-            with open("log.txt", "a") as f:
-                f.write(f"seqlen: {req.seqlen}, loc_idx: {loc_idx}\n")
+        if prelen >= req.evicted_seqlen_local + attention_chunk_size:
+            new_evicted_seqlen_local = attention_chunk_size * (
+                prelen // attention_chunk_size
+            )
             free_slots = self.req_to_token_pool.req_to_token_local[
-                req.req_pool_idx, req.evicted_seqlen_local : loc_idx
+                req.req_pool_idx, req.evicted_seqlen_local : new_evicted_seqlen_local
             ]
             self.token_to_kv_pool_allocator_local.free(free_slots)
-            req.evicted_seqlen_local = loc_idx
+            req.evicted_seqlen_local = new_evicted_seqlen_local
 
     def evict(self, num_tokens: int):
         pass
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
@@ -887,7 +887,12 @@ def init_memory_pool(
 
         # create token size for hybrid cache
         if self.is_hybrid is not None:
-            self.get_num_token_hybrid()
+            if self.server_args.disable_radix_cache:
+                self.get_num_token_hybrid()
+            else:
+                raise RuntimeError(
+                    "Hybrid cache does not support radix_cache currently. Please set --disable-radix-cache."
+                )
 
         if self.max_total_num_tokens <= 0:
             raise RuntimeError(
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
@@ -696,7 +696,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
         parser.add_argument(
             "--enable-hybrid-kvcache",
             nargs="?",
-            const=1.0,
+            const=0.5,
             type=float,
             default=ServerArgs.enable_hybrid_kvcache,
             help=(