Skip to content

Commit d053027

Browse files
committed
set default values
1 parent cf4fc0a commit d053027

File tree

5 files changed

+23
-14
lines changed

5 files changed

+23
-14
lines changed

python/sglang/srt/configs/model_config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ def get_total_num_kv_heads(self) -> int:
288288
num_kv_heads = getattr(self.hf_text_config, attr, None)
289289
if num_kv_heads is not None:
290290
return num_kv_heads
291-
291+
292292
# For non-grouped-query attention models, the number of KV heads is
293293
# equal to the number of attention heads.
294294
return self.hf_text_config.num_attention_heads

python/sglang/srt/managers/schedule_batch.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1168,7 +1168,7 @@ def prepare_for_extend(self):
11681168
(req.req_pool_idx, slice(0, pre_len)), req.prefix_indices_local
11691169
)
11701170
self.tree_cache.evict_hybrid(
1171-
req, self.model_config.attention_chunk_size
1171+
req, pre_len, self.model_config.attention_chunk_size
11721172
)
11731173

11741174
# If input_embeds are available, store them
@@ -1560,6 +1560,13 @@ def prepare_for_decode(self):
15601560
self.seq_lens.add_(1)
15611561
self.seq_lens_sum += bs
15621562

1563+
# free memory
1564+
if self.token_to_kv_pool_allocator_local is not None:
1565+
for req in self.reqs:
1566+
self.tree_cache.evict_hybrid(
1567+
req, req.seqlen - 1, self.model_config.attention_chunk_size
1568+
)
1569+
15631570
# Allocate memory
15641571
if self.token_to_kv_pool_allocator.page_size == 1:
15651572
self.out_cache_loc = self.alloc_token_slots(bs)
@@ -1582,10 +1589,6 @@ def prepare_for_decode(self):
15821589
self.req_to_token_pool.write_local(
15831590
(self.req_pool_indices, locs), self.out_cache_loc_local.to(torch.int32)
15841591
)
1585-
for req in self.reqs:
1586-
self.tree_cache.evict_hybrid(
1587-
req, self.model_config.attention_chunk_size
1588-
)
15891592

15901593
def filter_batch(
15911594
self,

python/sglang/srt/mem_cache/chunk_cache.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -72,17 +72,18 @@ def insert(self):
7272
def evict_hybrid(
7373
self,
7474
req: Req,
75+
prelen: int,
7576
attention_chunk_size: int,
7677
):
77-
if req.seqlen > req.evicted_seqlen_local + attention_chunk_size:
78-
loc_idx = attention_chunk_size * (req.seqlen // attention_chunk_size)
79-
with open("log.txt", "a") as f:
80-
f.write(f"seqlen: {req.seqlen}, loc_idx: {loc_idx}\n")
78+
if prelen >= req.evicted_seqlen_local + attention_chunk_size:
79+
new_evicted_seqlen_local = attention_chunk_size * (
80+
prelen // attention_chunk_size
81+
)
8182
free_slots = self.req_to_token_pool.req_to_token_local[
82-
req.req_pool_idx, req.evicted_seqlen_local : loc_idx
83+
req.req_pool_idx, req.evicted_seqlen_local : new_evicted_seqlen_local
8384
]
8485
self.token_to_kv_pool_allocator_local.free(free_slots)
85-
req.evicted_seqlen_local = loc_idx
86+
req.evicted_seqlen_local = new_evicted_seqlen_local
8687

8788
def evict(self, num_tokens: int):
8889
pass

python/sglang/srt/model_executor/model_runner.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -887,7 +887,12 @@ def init_memory_pool(
887887

888888
# create token size for hybrid cache
889889
if self.is_hybrid is not None:
890-
self.get_num_token_hybrid()
890+
if self.server_args.disable_radix_cache:
891+
self.get_num_token_hybrid()
892+
else:
893+
raise RuntimeError(
894+
"Hybrid cache does not support radix_cache currently. Please set --disable-radix-cache."
895+
)
891896

892897
if self.max_total_num_tokens <= 0:
893898
raise RuntimeError(

python/sglang/srt/server_args.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -696,7 +696,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
696696
parser.add_argument(
697697
"--enable-hybrid-kvcache",
698698
nargs="?",
699-
const=1.0,
699+
const=0.5,
700700
type=float,
701701
default=ServerArgs.enable_hybrid_kvcache,
702702
help=(

0 commit comments

Comments
 (0)