Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
322 commits
Select commit Hold shift + click to select a range
72a4d36
Refactor: Inline retract_all, delete the function
fzyzcjy May 28, 2026
94ea0fc
Refactor: Minimize batch_result_processor diff vs pre-refactor
fzyzcjy May 28, 2026
8b6f4d2
Refactor: Simplify pause_generation(retract) chunked release
fzyzcjy May 28, 2026
c25d10d
Refactor: Replace defensive only_decode_ready filters with asserts
fzyzcjy May 28, 2026
61f94e3
Refactor: Drop obsolete PP cross-mb idempotency guard in _handle_fini…
fzyzcjy May 28, 2026
6f0e4a9
Refactor host_hit_length reuse skip to explicit is_resume branch
fzyzcjy May 28, 2026
efffdef
Restore main-upstream add_chunked_req as temporary alias for dispatch…
fzyzcjy May 28, 2026
3d3f8ec
Split add_one_req into add_first_chunk_req / add_non_first_chunk_req …
fzyzcjy May 28, 2026
653b585
Clean is_resume residue from add_first_chunk_req
fzyzcjy May 28, 2026
fdc0efc
Adapt add_non_first_chunk_req to dev-f convention
fzyzcjy May 28, 2026
94ff96a
Add return type annotation to add_first_chunk_req
fzyzcjy May 28, 2026
5a75498
Apply black formatting to add_first_chunk_req signature
fzyzcjy May 28, 2026
c2d7f2f
Move add_non_first_chunk_req to match main-upstream's add_chunked_req…
fzyzcjy May 29, 2026
ff77a84
Replace Req.fill_ids array with derived fill_len
fzyzcjy May 29, 2026
b37ddd3
Minimize add_non_first_chunk_req diff vs main-upstream add_chunked_req
fzyzcjy May 29, 2026
6c89024
Tweak stale comment wording: 'truncates' -> 'shrinks' for fill_len
fzyzcjy May 29, 2026
12c662e
Apply black reformat
fzyzcjy May 29, 2026
0242fe0
Rename Req helpers for clarity
fzyzcjy May 29, 2026
19a48a9
Apply black reformat
fzyzcjy May 29, 2026
033174f
Make DLLM fill_len single-phase
fzyzcjy May 29, 2026
894bfdf
Revert fill_ids derive-only refactor
fzyzcjy May 29, 2026
9154b65
Refactor Req.fill_ids into (full_untruncated_fill_ids, fill_len)
fzyzcjy May 29, 2026
b1afd8d
Apply black reformat
fzyzcjy May 29, 2026
8890a1f
Update stale fill_ids references in test docstrings
fzyzcjy May 29, 2026
13d6ec5
Preserve OLD reset_for_retract behavior: don't clear fill_ids state
fzyzcjy May 29, 2026
9c5c20a
Use get_fill_ids() for the 3 reads in init/prefetch/DLLM-phase
fzyzcjy May 29, 2026
378c667
Restrict fill_len to truncated/committed semantics
fzyzcjy May 29, 2026
16f8ec8
Revert 3 reads to full_untruncated_fill_ids under PR2 semantics
fzyzcjy May 29, 2026
0dd2580
Allow PR test and lint workflows to trigger on non-main bases
fzyzcjy May 29, 2026
6a05dd4
Merge tom/ci_unblock_chain_pr_test into tom/refactor_retract_all
fzyzcjy May 29, 2026
bda5d5a
Merge tom/refactor_retract_all into tom/release_req_free_func
fzyzcjy May 29, 2026
33a861d
Merge tom/release_req_free_func into tom/refactor_fill_ids
fzyzcjy May 29, 2026
ac54584
Merge tom/refactor_fill_ids into tom/refactor_fill_ids_b
fzyzcjy May 29, 2026
9b1832b
Write fill_len in non-chunked add_one_req admission
fzyzcjy May 29, 2026
35b6d20
Use len(full_untruncated_fill_ids) + assert equivalence
fzyzcjy May 29, 2026
84cc52b
Write fill_len in add_one_req_ignore_eos non-chunked admission
fzyzcjy May 29, 2026
b8be787
Merge upstream/main into tom/refactor_retract_all
fzyzcjy May 31, 2026
6bde970
Merge tom/refactor_retract_all into tom/release_req_free_func
fzyzcjy May 31, 2026
86737d8
Merge tom/release_req_free_func into tom/refactor_fill_ids
fzyzcjy May 31, 2026
6ca7496
Translate new main fill_ids usage to fill_len in _compute_chunked_req…
fzyzcjy May 31, 2026
c9021a1
Merge tom/refactor_fill_ids into tom/refactor_fill_ids_b
fzyzcjy May 31, 2026
39a44ef
Add include_parallel_rank_in_filename option to dumper
fzyzcjy May 31, 2026
6052c6e
Remap pipeline-local layer indices to global in dump_model
fzyzcjy May 31, 2026
decc7fb
Merge tom/dumper_0531 into tom/refactor_retract_all
fzyzcjy May 31, 2026
e6a0837
Merge tom/refactor_retract_all into tom/release_req_free_func
fzyzcjy May 31, 2026
3bb5798
Merge tom/release_req_free_func into tom/refactor_fill_ids
fzyzcjy May 31, 2026
8501119
Merge tom/refactor_fill_ids into tom/refactor_fill_ids_b
fzyzcjy May 31, 2026
25f1175
Speed up dump comparator percentile computation using numpy
fzyzcjy Jun 1, 2026
35a0721
Merge tom/opt_dump_comparator_percentile into tom/refactor_retract_al…
fzyzcjy Jun 1, 2026
28bc9d9
Merge tom/refactor_retract_all into tom/release_req_free_func (chain)
fzyzcjy Jun 1, 2026
4978356
Merge tom/release_req_free_func into tom/refactor_fill_ids (chain)
fzyzcjy Jun 1, 2026
a1cfa46
Merge tom/refactor_fill_ids into tom/refactor_fill_ids_b (chain)
fzyzcjy Jun 1, 2026
24f6db3
Merge branch 'main' of github.com:sgl-project/sglang
fzyzcjy Jun 1, 2026
7d6f14f
Merge tom/dumper_0531 into tom/opt_dump_comparator_percentile (chain)
fzyzcjy Jun 1, 2026
6c6e18a
Merge tom/opt_dump_comparator_percentile into tom/refactor_retract_al…
fzyzcjy Jun 1, 2026
1451113
Merge tom/refactor_retract_all into tom/release_req_free_func (chain)
fzyzcjy Jun 1, 2026
32cc317
Merge tom/release_req_free_func into tom/refactor_fill_ids (chain)
fzyzcjy Jun 1, 2026
13027ac
Merge tom/refactor_fill_ids into tom/refactor_fill_ids_b (chain)
fzyzcjy Jun 1, 2026
6589689
Replace _chunked_req_scheduled_last_iter flag with content-based stas…
fzyzcjy Jun 1, 2026
787af65
Merge upstream/main into feat/stateless_scheduler_b
fzyzcjy Jun 1, 2026
86b2367
Remove obsolete imports after upstream merge
fzyzcjy Jun 1, 2026
67e2426
Merge PR #26850 (parallel-rank dump filenames + pipeline-global layer…
fzyzcjy Jun 1, 2026
b4558f1
Merge PR #26874 (speed up dump comparator percentile with numpy)
fzyzcjy Jun 1, 2026
b41b275
Merge PR #26547 (avoid filter_batch with unrelated chunked_req_to_exc…
fzyzcjy Jun 1, 2026
c20bfb0
Merge PR #26548 (extract release_req and retract_all as module-level …
fzyzcjy Jun 1, 2026
8244293
Merge PR #26637 (refactor Req.fill_ids into full_untruncated_fill_ids…
fzyzcjy Jun 1, 2026
b07412f
Merge PR #26659 (restrict Req fill_len to truncated/committed semantics)
fzyzcjy Jun 1, 2026
c71bdc1
Merge PR #26938 (content-based stash gate for chunked req)
fzyzcjy Jun 1, 2026
0ed8b6c
Merge upstream/main into tom/dumper_0531
fzyzcjy Jun 1, 2026
a2e0622
Merge tom/dumper_0531 into tom/opt_dump_comparator_percentile (chain)
fzyzcjy Jun 1, 2026
1c2bd6d
Merge tom/opt_dump_comparator_percentile into tom/refactor_retract_al…
fzyzcjy Jun 1, 2026
85090b8
Merge tom/refactor_retract_all into tom/release_req_free_func (chain)
fzyzcjy Jun 1, 2026
d45abc7
Merge tom/release_req_free_func into tom/refactor_fill_ids (chain)
fzyzcjy Jun 1, 2026
8cd38d5
Merge tom/refactor_fill_ids into tom/refactor_fill_ids_b (chain)
fzyzcjy Jun 1, 2026
530beae
Merge tom/refactor_fill_ids_b into tom/rm_chunked_req_scheduled_last_…
fzyzcjy Jun 1, 2026
9189129
Merge tom/rm_chunked_req_scheduled_last_iter into feat/stateless_sche…
fzyzcjy Jun 1, 2026
e2a6556
Fix scheduler pause generation unit test
fzyzcjy Jun 1, 2026
a39ec1d
Skip scheduled_extend_len bounds assert for DLLM reqs
fzyzcjy Jun 1, 2026
2bbf250
Migrate PP skip-output-comm validation off removed inflight_middle_ch…
fzyzcjy Jun 1, 2026
d0ead8a
Cover disagg-decode running reqs in abort_request
fzyzcjy Jun 1, 2026
8020925
Fix chunked_req AttributeError in prefill batch build
fzyzcjy Jun 1, 2026
a4187d8
Drop finished prefill-only reqs in is_prefill_only branch
fzyzcjy Jun 1, 2026
03706bf
Revert "Cover disagg-decode running reqs in abort_request"
fzyzcjy Jun 1, 2026
4328785
Revert "Migrate PP skip-output-comm validation off removed inflight_m…
fzyzcjy Jun 1, 2026
78320c1
Reapply "Migrate PP skip-output-comm validation off removed inflight_…
fzyzcjy Jun 1, 2026
412417b
Track disagg-decode reqs in active_reqs
fzyzcjy Jun 1, 2026
2005c85
Fix scheduled extend length for retracted decode requests
fzyzcjy Jun 1, 2026
57bc204
Freeze scheduled extend target length
fzyzcjy Jun 1, 2026
cacbe41
Assert empty prefix in add_one_req_ignore_eos chunked branch
fzyzcjy Jun 2, 2026
33f3f81
Merge tom/symmetric_ignore_eos_fill_len into tom/rm_chunked_req_sched…
fzyzcjy Jun 2, 2026
6e3e20a
Merge tom/rm_chunked_req_scheduled_last_iter into feat/stateless_sche…
fzyzcjy Jun 2, 2026
ed2910a
Document Req.fill_len semantics
fzyzcjy Jun 2, 2026
a8d4676
Merge remote-tracking branch 'upstream/main' into feat/stateless_sche…
fzyzcjy Jun 8, 2026
c590e11
Sandbox: trigger CI for stateless_scheduler_b (do not merge)
fzyzcjy Jun 8, 2026
66f2677
Fix stale SWATokenToKVPoolAllocator import after upstream merge
fzyzcjy Jun 8, 2026
1720a75
Fix stale get_chunked_req call in SchedulerLoadInquirer
fzyzcjy Jun 8, 2026
fdfe078
Adapt scripted runtime to stateless chunked_reqs() API
fzyzcjy Jun 8, 2026
ecc51ef
Drop hybrid-SWA tests for removed PrefillAdder.add_chunked_req
fzyzcjy Jun 8, 2026
95feb21
Key cache_unfinished_req on kv_committed_len instead of fill_len
fzyzcjy Jun 8, 2026
6e89bcb
Inline get_committed_fill_ids into cache_unfinished_req call sites
fzyzcjy Jun 8, 2026
d5e1e39
Assert fill_len == kv_committed_len in remaining cache_unfinished_req…
fzyzcjy Jun 8, 2026
21b77d6
Use None instead of 0 as Req.fill_len invalid sentinel
fzyzcjy Jun 8, 2026
ab6c992
Invalidate Req.fill_len to None on entering decode
fzyzcjy Jun 8, 2026
b879f3f
Drop verbose comment on fill_len decode invalidation
fzyzcjy Jun 8, 2026
9f338d8
Rename Req.fill_len to Req.extend_fill_len
fzyzcjy Jun 8, 2026
0e330b8
Avoid dual semantics of extend_input_len by computing the candidate o…
fzyzcjy Jun 9, 2026
835574e
Avoid scattered assignment of extend_input_len and extend_fill_len by…
fzyzcjy Jun 9, 2026
1704fb0
Inline extend_range accessors and remove the read-only properties
fzyzcjy Jun 9, 2026
9ed8aca
Merge PR #27611 (inline extend_range accessors, drop extend_input_len…
fzyzcjy Jun 9, 2026
77d0b0c
Remove newly-added comments and docstrings from the PR diff
fzyzcjy Jun 9, 2026
917ba7e
Remove dead ScheduleBatch fields resurrected by bad merges
fzyzcjy Jun 9, 2026
65e56bf
Key cache_unfinished_req on kv_committed_len instead of fill_len
fzyzcjy Jun 8, 2026
d0a8384
Inline get_committed_fill_ids into cache_unfinished_req call sites
fzyzcjy Jun 8, 2026
f435171
Use None instead of 0 as Req.fill_len invalid sentinel
fzyzcjy Jun 8, 2026
6e5ee1c
Invalidate Req.fill_len to None on entering decode
fzyzcjy Jun 8, 2026
14941f8
Assert fill_len == kv_committed_len in remaining cache_unfinished_req…
fzyzcjy Jun 8, 2026
fb47155
Drop verbose comment on fill_len decode invalidation
fzyzcjy Jun 8, 2026
a4e7bcb
Rename Req.fill_len to Req.extend_fill_len
fzyzcjy Jun 8, 2026
c17406d
Avoid dual semantics of extend_input_len by computing the candidate o…
fzyzcjy Jun 9, 2026
487eb9f
Avoid scattered assignment of extend_input_len and extend_fill_len by…
fzyzcjy Jun 9, 2026
3d6ddac
Inline extend_range accessors and remove the read-only properties
fzyzcjy Jun 9, 2026
2c08145
Replace Req.extend_logprob_start_len field with a pure free function
fzyzcjy Jun 9, 2026
20b68bc
Re-wire EAGLE chunked-prefill next-token chain onto stateless model
fzyzcjy Jun 9, 2026
81989e2
Carry over the original key-variables comment and flag set_extend_ran…
fzyzcjy Jun 9, 2026
880baa9
Pass the batch's extend_logprob_start_lens snapshot directly to batch…
fzyzcjy Jun 9, 2026
74ca62a
Drop the newly-added explanatory comments, keeping only carried-over …
fzyzcjy Jun 9, 2026
2a2d0fd
Minimize the padding hunk to the only forced change (field -> snapsho…
fzyzcjy Jun 9, 2026
317b8e0
Set unused extend_logprob_start_lens to None in prebuilt path
fzyzcjy Jun 9, 2026
f421f19
Drop comment on extend_logprob_start_lens None assignment
fzyzcjy Jun 9, 2026
4c5118c
Drop dead extend_input_logprob_token_ids boilerplate in prebuilt path
fzyzcjy Jun 9, 2026
805b69b
Drop the stale leading line of the carried key-variables comment
fzyzcjy Jun 9, 2026
497beb8
Derive has_pending_chunk from extend_range
fzyzcjy Jun 9, 2026
6da78b1
Derive output process mode and chunked next prompt token from extend_…
fzyzcjy Jun 9, 2026
fd6013e
Delete scheduled_extend_len and scheduled_extend_target_len fields
fzyzcjy Jun 9, 2026
3f4efd6
Update prefill adder tests for deleted scheduled_extend_len fields
fzyzcjy Jun 9, 2026
887f849
Apply black formatting to prefill adder test
fzyzcjy Jun 9, 2026
40c8be0
Merge tom/extend-logprob-start-len-free-fn into tom/stateless_schedul…
fzyzcjy Jun 9, 2026
f8ae413
Revert read_len alias in cache_unfinished_req to minimize diff
fzyzcjy Jun 9, 2026
a830903
Revert streaming_session chunked slice back to extend_range.end
fzyzcjy Jun 9, 2026
2f52bb4
Restore forward_pass_metrics module docstring removed in error
fzyzcjy Jun 9, 2026
35e48d0
Address PR review feedback on stateless-scheduler cleanup
fzyzcjy Jun 9, 2026
b6a0bea
Revert decode _pre_alloc extend_range start to total_prefix_len
fzyzcjy Jun 9, 2026
eccda4e
Restore prefix_indices comments, retargeting renamed add_non_first_ch…
fzyzcjy Jun 9, 2026
7e94cfb
Minimize gratuitous diff in stateless-scheduler cleanup
fzyzcjy Jun 9, 2026
f4fc1d1
Merge branch 'tom/stateless_scheduler_b_diffmin' into tom/stateless_s…
fzyzcjy Jun 9, 2026
9d7e85e
Note transient prefix_indices mismatch in extend_range TODO
fzyzcjy Jun 9, 2026
378f5d2
Drop redundant kv_committed_len >= cache_protected_len assert in cach…
fzyzcjy Jun 9, 2026
152b042
Restore dllm-stash before chunked-stash order to minimize diff
fzyzcjy Jun 9, 2026
f0afb38
Migrate DEBUG_INVARIANTS to SGLANG_DEBUG_REQS_INVARIANTS via environ
fzyzcjy Jun 9, 2026
41b7603
Return a list from chunked_reqs instead of a lazy iterable
fzyzcjy Jun 9, 2026
bc3dbce
Deactivate optimistic-bootstrap reqs on failure and requeue
fzyzcjy Jun 9, 2026
3dccc8f
Deactivate finished prebuilt reqs in process_batch_result_prebuilt
fzyzcjy Jun 9, 2026
9981bae
Merge correctness fixes: deactivate aborted/requeued bootstrap reqs a…
fzyzcjy Jun 9, 2026
74b85fb
Restore stash_chunked_request helper instead of inlining maybe_cache_…
fzyzcjy Jun 9, 2026
c6fad8a
Move stash_chunked_request back to its original position before _buil…
fzyzcjy Jun 9, 2026
894e738
Drop underscore prefix from chunked_in_active local var
fzyzcjy Jun 9, 2026
ccdd942
Drop DLLM_* output-process modes in favor of is_intermediate + is_dllm
fzyzcjy Jun 9, 2026
5abb0ee
Remove derived ScheduleBatch.chunked_req in favor of output_process_mode
fzyzcjy Jun 9, 2026
88d429d
Clamp cache_unfinished_req cached length to the prompt boundary
fzyzcjy Jun 9, 2026
4a650ae
Merge tom/cache-unfinished-req-use-committed-len (clamp cache_unfinis…
fzyzcjy Jun 9, 2026
3a5dfee
Merge tom/req-fill-len-none (clamp cache_unfinished_req length)
fzyzcjy Jun 9, 2026
71c53cb
Merge tom/rename-fill-len-extend-fill-len (clamp cache_unfinished_req…
fzyzcjy Jun 9, 2026
20f66b3
Merge tom/extend-range-consolidate (clamp cache_unfinished_req length)
fzyzcjy Jun 9, 2026
240b8c7
Merge tom/extend-candidate-on-demand (clamp cache_unfinished_req length)
fzyzcjy Jun 9, 2026
1a4fc3a
Fix test doubles for the extend_range / kv_committed_len migration
fzyzcjy Jun 9, 2026
8baeca8
Merge tom/extend-range-inline (clamp cache_unfinished_req length + te…
fzyzcjy Jun 9, 2026
6ec36b4
Replace output_process_mode enum with is_extend_intermediate bool
fzyzcjy Jun 9, 2026
b2d74d8
Name full_untruncated_fill_ids locals after their getter
fzyzcjy Jun 9, 2026
36785eb
Rename _decide_is_extend_intermediate to _compute_is_extend_intermediate
fzyzcjy Jun 9, 2026
0464dba
Rename local is_intermediate loop var to is_extend_intermediate
fzyzcjy Jun 9, 2026
a33d841
Remove dead ScheduleBatch.is_hybrid_swa field missed in upstream merge
fzyzcjy Jun 9, 2026
df9187c
Merge upstream/main (via tom/remove-full-untruncated-fill-ids)
fzyzcjy Jun 9, 2026
4ef982d
Merge upstream/main (via casc-27573)
fzyzcjy Jun 9, 2026
a0659c7
Merge upstream/main (via casc-27571)
fzyzcjy Jun 9, 2026
c3373c2
Merge upstream/main (via casc-27575)
fzyzcjy Jun 9, 2026
6a50dde
Merge upstream/main (via casc-27616)
fzyzcjy Jun 9, 2026
b00a6ca
Merge upstream/main (via casc-27610)
fzyzcjy Jun 9, 2026
27ff868
Merge upstream/main (via casc-27611)
fzyzcjy Jun 9, 2026
1c08bb5
Restore DLLM request abort dropped by active_reqs migration
fzyzcjy Jun 9, 2026
21655c6
Merge tom/extend-logprob-start-len-free-fn (chain + upstream/main)
fzyzcjy Jun 9, 2026
839b1eb
Revert local rename to full_untruncated_fill_ids
fzyzcjy Jun 9, 2026
7e71be0
Align stale padding comment with extend_range/extend_logprob_start_lens
fzyzcjy Jun 9, 2026
6b568c9
Rename filter_batch only_decode_ready to skip_extend_intermediate
fzyzcjy Jun 9, 2026
fb33122
Apply black formatting after filter_batch kwarg rename
fzyzcjy Jun 9, 2026
8c3e1cc
Reflow padding comment to match upstream #27625 wording
fzyzcjy Jun 9, 2026
8f0f736
Merge tom/extend-logprob-start-len-free-fn (chain: upstream padding-c…
fzyzcjy Jun 9, 2026
49bce4e
Move is_extend_intermediate merge to end of merge_batch
fzyzcjy Jun 9, 2026
2a1d6d9
Assert extend_range.end > 0 in has_pending_chunk instead of guarding
fzyzcjy Jun 9, 2026
4163212
Rename chunked-req scheduler state to partially_extended vocabulary
fzyzcjy Jun 9, 2026
e076037
Apply pre-commit formatting after partially_extended rename
fzyzcjy Jun 9, 2026
691d450
Merge tom/extend-logprob-start-len-free-fn (rebuilt chain: drop None …
fzyzcjy Jun 12, 2026
61d90da
Rename cache flag chunked to is_partially_extended
fzyzcjy Jun 12, 2026
04655d4
Make maybe_cache_unfinished_req flag parameter explicit
fzyzcjy Jun 12, 2026
8715e13
Apply pre-commit auto-fixes
fzyzcjy Jun 12, 2026
9f8c2c3
Track the request lifecycle phase explicitly via Req.phase
fzyzcjy Jun 12, 2026
bc75ecf
Enter the extend phase at set_extend_range and base is_partially_exte…
fzyzcjy Jun 12, 2026
670e2bc
Rename ReqPhase.QUEUED to OTHERS since it covers every untracked stat…
fzyzcjy Jun 12, 2026
1562ea0
Cross-check Req.phase against committed KV in the busy invariant check
fzyzcjy Jun 12, 2026
e8230fe
Apply isort to invariant_checker import order
fzyzcjy Jun 12, 2026
90b4b50
Remove resurrected v1_spec_info_filtered parameter from filter_batch
fzyzcjy Jun 12, 2026
9b13478
Remove duplicated compute_extend_logprob_start_len definition
fzyzcjy Jun 12, 2026
0aaf49c
Restore hybrid-SWA chunked prefill tests on the resumed-extend API
fzyzcjy Jun 12, 2026
e61aec5
Restore stash-gate regression tests on the stateless scheduler model
fzyzcjy Jun 12, 2026
7f0e3b4
Drop stale spec_v1 reference from merge_batch comment
fzyzcjy Jun 12, 2026
41c01df
Move the partially-extended batch query after prepare_for_extend
fzyzcjy Jun 12, 2026
29ded7a
Drop the explanatory comment on the moved partially-extended query
fzyzcjy Jun 12, 2026
1da4f3d
Move the ReqPhase.EXTEND transition from set_extend_range to prepare_…
fzyzcjy Jun 12, 2026
349790c
Assert that an active req holding an extend range is in a tracked phase
fzyzcjy Jun 12, 2026
6f83434
Fix stale chunked_req reference in dflash prefill delay check
fzyzcjy Jun 12, 2026
f7f07a9
Include partially-extended reqs in is_fully_idle batch status
fzyzcjy Jun 12, 2026
1b58e9a
Always run retract path in pause_generation regardless of running_batch
fzyzcjy Jun 12, 2026
6c2ac56
Reset req phase to OTHERS on bootstrap failure
fzyzcjy Jun 12, 2026
11095d1
Add strict=True to is_extend_intermediate zip in eagle prefill tail t…
fzyzcjy Jun 12, 2026
35202aa
Snapshot extend intermediacy at admission to kill phantom partially-e…
fzyzcjy Jun 12, 2026
478b3fc
Stop resuming a chunked prefill when its bootstrap poll is deferred
fzyzcjy Jun 12, 2026
224bc67
Exclude finished reqs from the running-subset-of-active debug assert
fzyzcjy Jun 12, 2026
8d2b4c8
Fix hand-built reqs in test_prefill_adder and align with snapshot sem…
fzyzcjy Jun 12, 2026
e318072
Rename add_first_extend_req to add_unstarted_extend_req
fzyzcjy Jun 12, 2026
9f56bce
Restore decode-radix re-match for retracted-resumed reqs
fzyzcjy Jun 12, 2026
8a56bc0
Revert the is_extend_intermediate snapshot field for redesign
fzyzcjy Jun 12, 2026
e801bcc
Split ReqPhase.EXTEND into EXTEND_NON_LAST and EXTEND_LAST
fzyzcjy Jun 12, 2026
00e2c3a
Set ReqPhase on hand-built reqs in test_prefill_adder
fzyzcjy Jun 12, 2026
56510dd
Inline is_partially_extended and ReqPhase.is_extend into their call s…
fzyzcjy Jun 12, 2026
b94651d
Set req.phase at the PrefillAdder admission decision instead of prepa…
fzyzcjy Jun 12, 2026
9e7d8ca
Apply black formatting to partially_extended_reqs
fzyzcjy Jun 12, 2026
1154594
Update stale comment: the PrefillAdder now derives the phase
fzyzcjy Jun 12, 2026
7156040
Set EXTEND_LAST on the dynamic-chunking profiler req
fzyzcjy Jun 12, 2026
74f4643
Replace dead is_partially_extended mock kwarg with phase
fzyzcjy Jun 12, 2026
2fceeb8
Move the DECODE phase transition before the spec early-return
fzyzcjy Jun 12, 2026
eaf483f
Report zero extend lengths for decode batches in run_batch
fzyzcjy Jun 12, 2026
3416dea
Exclude the dLLM mask tail from cache_unfinished_req
fzyzcjy Jun 12, 2026
23fb972
Set ReqPhase.DECODE in spec-v2 prepare_for_decode paths
fzyzcjy Jun 12, 2026
8f8abd4
Revert "Set ReqPhase.DECODE in spec-v2 prepare_for_decode paths"
fzyzcjy Jun 12, 2026
6e72294
Move the DECODE phase transition before the spec early-return
fzyzcjy Jun 12, 2026
24d4c88
Drop dead extend_range/kv_committed_len None guards
fzyzcjy Jun 12, 2026
7ebd8e6
Restore the kv_committed_len None guard in get_new_prebuilt_batch
fzyzcjy Jun 12, 2026
0e47c4a
Revert "Exclude the dLLM mask tail from cache_unfinished_req"
fzyzcjy Jun 12, 2026
33dfee2
Merge branch 'tom/stateless_scheduler_b' into tom/stateless_scheduler…
fzyzcjy Jun 12, 2026
8a04683
Dummy commit to retrigger CI (reverted next commit)
fzyzcjy Jun 12, 2026
ad8f86c
Revert "Dummy commit to retrigger CI (reverted next commit)"
fzyzcjy Jun 12, 2026
b1a5d93
Migrate manual chunked-prefill tests off removed Scheduler.chunked_req
fzyzcjy Jun 12, 2026
fc9f42a
Apply black/isort formatting to chunked_req_of migration
fzyzcjy Jun 12, 2026
4a34564
Migrate manual tests off removed Req.inflight_middle_chunks
fzyzcjy Jun 12, 2026
9716bce
Apply isort to inflight_middle_chunks_of migration
fzyzcjy Jun 12, 2026
c7e2f1a
Replace removed disable_piecewise_cuda_graph in manual chunked tests
fzyzcjy Jun 12, 2026
58f1580
Migrate manual tests off removed Req.extend_input_len
fzyzcjy Jun 12, 2026
8b24faf
Apply black/isort formatting to extend_input_len_of migration
fzyzcjy Jun 12, 2026
6a02e1c
Migrate manual test off removed Req.fill_ids attribute
fzyzcjy Jun 12, 2026
96ab4d8
Assert decode-req path in prefill-adder intermediate test
fzyzcjy Jun 14, 2026
f68b908
Cover partially-extended retract branch in pause_generation test
fzyzcjy Jun 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 25 additions & 8 deletions python/sglang/srt/disaggregation/decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
)
from sglang.srt.environ import envs
from sglang.srt.layers.dp_attention import get_attention_tp_size
from sglang.srt.managers.schedule_batch import FINISH_ABORT, ScheduleBatch
from sglang.srt.managers.schedule_batch import FINISH_ABORT, ReqPhase, ScheduleBatch
from sglang.srt.managers.schedule_policy import match_prefix_for_req
from sglang.srt.managers.utils import GenerationBatchResult
from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
Expand All @@ -79,6 +79,7 @@
ReqToTokenPool,
)
from sglang.srt.mem_cache.swa_memory_pool import SWAKVPool
from sglang.srt.model_executor.forward_batch_info import ForwardMode
from sglang.srt.observability.req_time_stats import (
set_schedule_time_batch,
set_time_batch,
Expand Down Expand Up @@ -154,9 +155,8 @@ def alloc(self, reqs: List[Req]) -> Optional[List[int]]:
len(reusing) <= 1
), "only one chunked request may reuse req_pool_idx in a batch"
assert all(
reqs[i].inflight_middle_chunks > 0 or reqs[i].kv_committed_len > 0
for i in reusing
), "reusing request must be chunked or have committed KV"
reqs[i].kv_committed_len > 0 for i in reusing
), "reusing request must have committed KV"

need_size = len(reqs) - len(reusing)
if need_size > len(self.free_slots):
Expand Down Expand Up @@ -1396,7 +1396,7 @@ def _pre_alloc(
kv_loc,
)

# Truncate fill_len to kv_committed_len so cache_unfinished_req only
# Truncate extend_fill_len to kv_committed_len so cache_unfinished_req only
# inserts committed KV into the radix tree. The last output token
# hasn't had KV committed yet (output_ids is 1 ahead).
# Set prefix_indices so downstream consumers (init_next_round_input,
Expand All @@ -1406,7 +1406,16 @@ def _pre_alloc(
req.prefix_indices = (
prefix_indices if prefix_len > 0 else torch.empty((0,), dtype=torch.int64)
)
# TODO: start can transiently disagree with len(prefix_indices) under HiCache
# decode prefetch, but it is behavior-neutral — only .end is read before
# get_new_prebuilt_batch resets extend_range ahead of the prebuilt forward.
req.set_extend_range(total_prefix_len, req.kv_committed_len)
# This prebuilt path never goes through the PrefillAdder, so enter
# the extend phase here; prepare_for_decode moves it to DECODE later.
# These reqs are not a real chunk sequence; NON_LAST keeps them visible
# as holders of not-yet-batched extend resources (pool stats / invariant
# checker) until prepare_for_decode flips them to DECODE.
req.phase = ReqPhase.EXTEND_NON_LAST

# Return the transfer destination indices:
if self.scheduler.enable_hisparse:
Expand Down Expand Up @@ -1805,11 +1814,13 @@ def get_next_disagg_decode_batch_to_run(
# Process pending prebuilt batch: output processing + filter + merge
new_prebuilt_batch = self.get_new_prebuilt_batch()
if new_prebuilt_batch:
assert self.chunked_req is None
self.batch_result_processor.process_batch_result_prebuilt(
new_prebuilt_batch
)
new_prebuilt_batch.filter_batch()
is_extend_intermediate = new_prebuilt_batch.is_extend_intermediate or []
assert not any(
is_extend_intermediate
), "prebuilt batch carries intermediate-mode reqs"
if not new_prebuilt_batch.is_empty():
Comment on lines +1820 to 1824

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The call to new_prebuilt_batch.filter_batch() was removed. This means finished requests are not filtered out of the prebuilt batch before it is merged into self.running_batch. Consequently, finished requests (whose KV cache has already been released) will be merged and executed again in the next forward pass, leading to redundant execution and potential CUDA/assertion crashes. Calling filter_batch() after the assertion prevents this.

            is_extend_intermediate = new_prebuilt_batch.is_extend_intermediate or []
            assert not any(
                is_extend_intermediate
            ), "prebuilt batch carries intermediate-mode reqs"
            new_prebuilt_batch.filter_batch()
            if not new_prebuilt_batch.is_empty():

if self.running_batch.is_empty():
self.running_batch = new_prebuilt_batch
Expand Down Expand Up @@ -1860,6 +1871,7 @@ def get_new_prebuilt_batch(self: Scheduler) -> Optional[ScheduleBatch]:
# we can only add at most `num_not_used_batch` new requests to the running queue
if i < num_not_used_batch:
can_run_list.append(req)
self._activate_req(req)
# Decode-radix path: new requests already matched in
# `pop_preallocated`. Retracted requests reset `last_node`,
# so re-match only when that state is missing.
Expand All @@ -1868,11 +1880,15 @@ def get_new_prebuilt_batch(self: Scheduler) -> Optional[ScheduleBatch]:
else:
tree_cache = self.tree_cache
req.init_next_round_input(tree_cache)
# Truncate fill_len to kv_committed_len so cache_unfinished_req
# Truncate extend_fill_len to kv_committed_len so cache_unfinished_req
# only sees committed KV (full array includes one uncommitted
# token because init_next_round_input rebuilt it as full).
if req.kv_committed_len is not None:
req.set_extend_range(len(req.prefix_indices), req.kv_committed_len)
# This prebuilt path never goes through the PrefillAdder,
# so enter the extend phase here; prepare_for_decode moves
# it to DECODE later.
req.phase = ReqPhase.EXTEND_NON_LAST
else:
waiting_queue.append(req)

Expand All @@ -1891,6 +1907,7 @@ def get_new_prebuilt_batch(self: Scheduler) -> Optional[ScheduleBatch]:
self.model_config,
self.enable_overlap,
self.spec_algorithm,
forward_mode=ForwardMode.PREBUILT,
)

# construct fake completed prefill
Expand Down
81 changes: 50 additions & 31 deletions python/sglang/srt/disaggregation/prefill.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
FINISH_ABORT,
FINISH_LENGTH,
Req,
ReqPhase,
ScheduleBatch,
)
from sglang.srt.mem_cache.common import (
Expand Down Expand Up @@ -561,7 +562,7 @@ def advance_logprob_pt(i: int, req: Req) -> None:
optimistic_reqs = [
(i, req)
for i, req in enumerate(batch.reqs)
if req.pending_bootstrap and req.inflight_middle_chunks <= 0
if req.pending_bootstrap and req.phase is not ReqPhase.EXTEND_NON_LAST
]
if optimistic_reqs:
polls = poll_and_all_reduce_attn_cp_tp_group(
Expand All @@ -573,10 +574,15 @@ def advance_logprob_pt(i: int, req: Req) -> None:
idx: poll for (idx, _), poll in zip(optimistic_reqs, polls)
}

for i, (req, next_token_id) in enumerate(
zip(batch.reqs, next_token_ids, strict=True)
for i, (req, next_token_id, is_extend_intermediate) in enumerate(
zip(
batch.reqs,
next_token_ids,
batch.is_extend_intermediate,
strict=True,
)
):
if req.inflight_middle_chunks <= 0:
if not is_extend_intermediate:
req.time_stats.set_prefill_finished_time()

# For optimistic requests, check bootstrap before side effects
Expand Down Expand Up @@ -622,15 +628,15 @@ def advance_logprob_pt(i: int, req: Req) -> None:
except ValueError as e:
error_message = f"Grammar accept_token failed for req {req.rid} with token {next_token_id}: {e}"
release_kv_cache(req, self.tree_cache)
self._deactivate_req(req)
prepare_abort(
req,
error_message,
status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
)
req.grammar.finished = req.finished()
else:
# being chunked reqs' prefill is not finished
req.inflight_middle_chunks -= 1
# partially-extended reqs have not finished their prefill yet

# Overlap deferred release for optimistic requests stopped in process_prefill_chunk
if req.pending_bootstrap:
Expand Down Expand Up @@ -722,6 +728,7 @@ def process_disagg_prefill_inflight_queue(
undone_reqs.append(req)
elif poll == KVPoll.Success: # transfer done
release_kv_cache(req, self.tree_cache) # unlock the tree
self._deactivate_req(req)
req.finished_reason = FINISH_LENGTH(length=0)
# FIXME: clean up req's data in transfer engine
if hasattr(req.disagg_kv_sender, "clear"):
Expand All @@ -743,6 +750,7 @@ def process_disagg_prefill_inflight_queue(
logger.warning(error_message)
req.time_stats.trace_ctx.abort(abort_info={"reason": error_message})
release_kv_cache(req, self.tree_cache) # unlock the tree
self._deactivate_req(req)
prepare_abort(
req, error_message, status_code=HTTPStatus.INTERNAL_SERVER_ERROR
)
Expand Down Expand Up @@ -839,6 +847,15 @@ def handle_bootstrap_failure(self: Scheduler, req: Req) -> None:
self.metrics_collector.increment_bootstrap_failed_reqs()
if self.enable_hicache_storage:
self.tree_cache.release_aborted_request(req.rid)
# The stateless scheduler derives the current partially-extended req from
# partially_extended_reqs() = active_reqs entries in the EXTEND_NON_LAST
# phase, which ignores req.finished(). An aborted req still
# sitting in active_reqs with a non-None extend_range would be re-derived
# as a partially-extended req and crash process_prefill_chunk (req_pool_idx=None).
# Remove it from active_reqs and clear the extend state defensively.
req.extend_range = None
req.phase = ReqPhase.OTHERS
self._deactivate_req(req)

def handle_pending_bootstrap(
self: Scheduler, req: Req, poll: KVPoll, defer_release: bool
Expand Down Expand Up @@ -881,35 +898,31 @@ def check_bootstrap(self: Scheduler, req: Req) -> bool:
)

def process_prefill_chunk(self: Scheduler) -> None:
chunked_req_to_exclude = set()
if self.chunked_req:
chunked_req_to_exclude.add(self.chunked_req)
maybe_cache_unfinished_req(self.chunked_req, self.tree_cache, chunked=True)

if not self.check_bootstrap(self.chunked_req):
self.chunked_req = None # stop the current chunked prefill
elif self.enable_overlap:
# Delay KV transfer to process_batch_result_disagg_prefill when overlap is enabled to ensure results are resolved
self.chunked_req.tmp_end_idx = min(
self.chunked_req.extend_range.end,
len(self.chunked_req.origin_input_ids),
)
else:
self.send_kv_chunk(self.chunked_req)

if self.chunked_req is not None:
partially_extended_req = next(iter(self.partially_extended_reqs()), None)
if partially_extended_req is not None:
maybe_cache_unfinished_req(
partially_extended_req, self.tree_cache, is_partially_extended=True
)
if self.check_bootstrap(partially_extended_req):
if self.enable_overlap:
# Delay KV transfer to process_batch_result_disagg_prefill when overlap is enabled to ensure results are resolved
partially_extended_req.tmp_end_idx = min(
partially_extended_req.extend_range.end,
len(partially_extended_req.origin_input_ids),
)
else:
self.send_kv_chunk(partially_extended_req)
self.running_batch.batch_is_full = False
else:
# Bootstrap not ready (deferred overlap poll) or failed: stop resuming
# this chunked prefill, mirroring the old `chunked_req = None`. The
# deferred optimistic_release_and_requeue (or the failure handler) owns
# the req from here; deactivating again there is an idempotent no-op.
self._deactivate_req(partially_extended_req)

if self.last_batch and self.last_batch.forward_mode.is_extend():
if self.last_batch.chunked_req:
# In the context of pipeline parallelism, after the last chunk, the current microbatch still tracks an outdated chunked_req.
# We need to discard it.
chunked_req_to_exclude.add(self.last_batch.chunked_req)

last_bs = self.last_batch.batch_size()
self.last_batch.filter_batch(
chunked_req_to_exclude=list(chunked_req_to_exclude)
)
self.last_batch.filter_batch(skip_extend_intermediate=True)
if self.last_batch.batch_size() < last_bs:
self.running_batch.batch_is_full = False

Expand Down Expand Up @@ -1017,6 +1030,12 @@ def optimistic_release_and_requeue(self: Scheduler, req: Req) -> None:
maybe_cache_unfinished_req(req, self.tree_cache)
release_kv_cache(req, self.tree_cache)
req.reset_for_retract()
# reset_for_retract() clears extend_range, but the req is still in
# active_reqs. Since the stateless scheduler derives partially_extended_reqs() from
# active_reqs, a requeued req that also remains active would be
# double-tracked (stale partially-extended req, load/accounting leak). Deactivate it
# before re-enqueue; get_next_batch_to_run reactivates it on reschedule.
self._deactivate_req(req)
req.output_ids = array("q")
req.start_send_idx = 0
req.tmp_end_idx = -1
Expand Down
9 changes: 1 addition & 8 deletions python/sglang/srt/dllm/mixin/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,6 @@ def _update_state_for_batch(

if can_run_list:
self.dllm_manager.add_staging_reqs(can_run_list)
self.dllm_manager.increment_inflight_middle_chunks()

self.adder = adder
self.can_run_list = can_run_list
Expand Down Expand Up @@ -255,9 +254,8 @@ def process_dllm_incoming_reqs(

# Prepare and add request
req.init_next_round_input(self.tree_cache)
res = adder.add_one_req(
res = adder.add_unstarted_extend_req(
req,
has_chunked_req=True,
truncation_align_size=self.truncation_align_size,
)

Expand Down Expand Up @@ -337,11 +335,6 @@ def is_empty(self) -> bool:
return True
return len(self.waiting_queue) == 0

def increment_inflight_middle_chunks(self) -> None:
"""Increment chunked count for all staging requests."""
for req in self.staging_queue:
req.inflight_middle_chunks += 1

def filter_finished_reqs(self) -> None:
"""Remove finished requests from both queues."""
self.waiting_queue = [req for req in self.waiting_queue if not req.finished()]
Expand Down
1 change: 1 addition & 0 deletions python/sglang/srt/environ.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,7 @@ class Envs:
SGLANG_RECORD_STEP_TIME = EnvBool(False)
SGLANG_FORCE_SHUTDOWN = EnvBool(False)
SGLANG_DEBUG_MEMORY_POOL = EnvBool(False)
SGLANG_DEBUG_REQS_INVARIANTS = EnvBool(False)
SGLANG_DEBUG_REVERT_PR = EnvInt(0)
SGLANG_PHASE_CHECKER_DEBUG = EnvBool(False)
SGLANG_TEST_REQUEST_TIME_STATS = EnvBool(False)
Expand Down
Loading
Loading