diff --git a/examples/runtime/hidden_states/hidden_states_engine.py b/examples/runtime/hidden_states/hidden_states_engine.py index 8af883ab1b..b130d8757d 100644 --- a/examples/runtime/hidden_states/hidden_states_engine.py +++ b/examples/runtime/hidden_states/hidden_states_engine.py @@ -28,6 +28,7 @@ def main(): "temperature": 0.8, "top_p": 0.95, "max_new_tokens": 10, + "n": 2, # ensure prefix cache will take effect } outputs = llm.generate( diff --git a/examples/runtime/hidden_states/hidden_states_server.py b/examples/runtime/hidden_states/hidden_states_server.py index 96045fad9d..d0602c8a05 100644 --- a/examples/runtime/hidden_states/hidden_states_server.py +++ b/examples/runtime/hidden_states/hidden_states_server.py @@ -38,6 +38,7 @@ def main(): "temperature": 0.8, "top_p": 0.95, "max_new_tokens": 10, + "n": 2, # ensure prefix cache will take effect } json_data = { diff --git a/python/sglang/srt/managers/scheduler_output_processor_mixin.py b/python/sglang/srt/managers/scheduler_output_processor_mixin.py index d2a450aecb..edf4aa118c 100644 --- a/python/sglang/srt/managers/scheduler_output_processor_mixin.py +++ b/python/sglang/srt/managers/scheduler_output_processor_mixin.py @@ -117,7 +117,7 @@ def process_batch_result_prefill( logits_output.hidden_states[ hidden_state_offset : ( hidden_state_offset := hidden_state_offset - + len(req.origin_input_ids) + + req.extend_input_len ) ] .cpu()