Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
627 commits
Select commit Hold shift + click to select a range
a9d4d99
Drop thin scripted-runtime wrappers, keep only non-trivial APIs
fzyzcjy May 31, 2026
5053d8a
Apply black to scripted-runtime core tests
fzyzcjy May 31, 2026
0216b40
Simplify list_active_reqs and require start_req plumbing kwargs
fzyzcjy May 31, 2026
3cde89b
Restore kv_pages and engine_stats as genuine multi-line wrappers
fzyzcjy May 31, 2026
4f2b4c1
Drop None guard in kv_pages so a missing req fails loud
fzyzcjy May 31, 2026
187f44f
Update manual scripted tests to current API surface (option A)
fzyzcjy May 31, 2026
378abc7
Fix Llama tied word embeddings under pipeline parallelism
fzyzcjy May 31, 2026
6aed448
Enable kv-canary only for non-PP scripted runtime servers
fzyzcjy May 31, 2026
cf82cc2
Drain pipeline micro-batches before flushing in per-script reset
fzyzcjy May 31, 2026
06b0716
Add per-request ignore_eos to scripted runtime start_req
fzyzcjy May 31, 2026
5b04f87
Support distinct prompt tokens and SWA lock refs in scripted runtime
fzyzcjy May 31, 2026
452dbd5
Make SWA chunked-req early-return test actually exercise the branch
fzyzcjy May 31, 2026
afc6643
Use Qwen3-0.6B for scripted runtime small model instead of patching L…
fzyzcjy Jun 1, 2026
bb91eae
Assert prefill-twice/decode-once radix hit_count instead of suppressi…
fzyzcjy Jun 1, 2026
097d69e
Assert the cache is really flushed in scripted ctx.flush_cache
fzyzcjy Jun 1, 2026
b04c139
Apply black formatting to scripted ctx api
fzyzcjy Jun 1, 2026
26839be
Assert prefill nodes are hit twice and decode nodes once by node role
fzyzcjy Jun 1, 2026
b3df1f2
Assert scheduler flush_cache result in scripted flush_cache
fzyzcjy Jun 1, 2026
3703d72
Revert "Assert scheduler flush_cache result in scripted flush_cache"
fzyzcjy Jun 1, 2026
efd2797
Remove flush-cache assertion from scripted runtime
fzyzcjy Jun 1, 2026
c70113a
Drive SWA chunked-req early-return via direct KV-pool drain
fzyzcjy Jun 1, 2026
2fe37e9
Merge efd279737c (scripted runtime: prompt_token, ignore_eos, flush-c…
fzyzcjy Jun 1, 2026
22ad99c
Add lock_refs and batch_composition low-hanging scripted-runtime APIs
fzyzcjy Jun 1, 2026
62cbabc
Reproduce SWA chunked-req defer via new_token_ratio jump, not pool drain
fzyzcjy Jun 1, 2026
691c637
Use is_chunking instead of has_pending_chunk in scripted regression t…
fzyzcjy Jun 1, 2026
7ced1c5
Use req.inflight_middle_chunks instead of pending_middle_outputs in m…
fzyzcjy Jun 1, 2026
286c048
Drop finish_event_count assertions; rely on the engine's own double-f…
fzyzcjy Jun 1, 2026
2561d57
Derive chunks_done from an on_run_batch observability hook
fzyzcjy Jun 1, 2026
fd20c46
Use ForwardMode.name.lower() for batch record mode
fzyzcjy Jun 1, 2026
f8e6bad
Record batch mode as None instead of 'unknown' when forward_mode is u…
fzyzcjy Jun 1, 2026
c492526
Merge branch 'tom/scripted_runtime_and_chunked_testing' into tom/scri…
fzyzcjy Jun 1, 2026
2ff58c0
Assert scripted forward_ct advances once per script yield
fzyzcjy Jun 1, 2026
4448e42
Add per-request abort_req helper to scripted runtime context
fzyzcjy Jun 1, 2026
460105f
Run scripted runtime tests with overlap scheduler and cuda graph
fzyzcjy Jun 1, 2026
40e631d
Add pp_size=2 kv-canary e2e baseline and perturb tests
fzyzcjy Jun 1, 2026
9d6ddbc
Enable kv-canary unconditionally in scripted runtime, including PP
fzyzcjy Jun 1, 2026
9193494
Merge tom/scripted_runtime_and_chunked_testing into ..._temp
fzyzcjy Jun 1, 2026
20c79fd
Merge branch 'tom/scripted_runtime_and_chunked_testing_temp' into tom…
fzyzcjy Jun 1, 2026
a5dcdc3
Add VERY_LONG_PROMPT_LEN helper and use public find_req_by_rid
fzyzcjy Jun 1, 2026
ceba6f8
Add scripted-runtime snapshot accessors (status, idle, logprobs)
fzyzcjy Jun 1, 2026
f5938ca
Apply black formatting to touched manual scripted tests
fzyzcjy Jun 1, 2026
1e0dca5
Drop thin Req passthroughs and rename forward_mode accessor
fzyzcjy Jun 1, 2026
cd2c137
Fix on_run_batch extend_rids: is_extend_in_batch is a batch attr, not…
fzyzcjy Jun 1, 2026
79f0600
Reproduce SWA chunked-req early-return defer via real retract churn
fzyzcjy Jun 1, 2026
3a6b52e
Make PP canary fixture run eager and relax sampling-sensitive SWA check
fzyzcjy Jun 1, 2026
aa11ba3
Merge branch 'tom/scripted_runtime_and_chunked_testing' into tom/scri…
fzyzcjy Jun 1, 2026
d53662d
Drive 2 batches for PP SWA divergence instead of relaxing the assertion
fzyzcjy Jun 1, 2026
ea5201d
Rewrite force_retract tests onto real pause_generation(retract) path
fzyzcjy Jun 1, 2026
e220c0f
Detect SWA chunked-req park from batch log instead of scheduler flag
fzyzcjy Jun 1, 2026
b2de384
Add scripted-runtime pool-pressure, evict, and radix-warmup primitives
fzyzcjy Jun 1, 2026
bc26ba4
Drive PP SWA canary past the window and assert peak divergence
fzyzcjy Jun 1, 2026
08c7f4c
Apply black formatting to touched canary unit test
fzyzcjy Jun 1, 2026
770e8b1
Revert "Add scripted-runtime pool-pressure, evict, and radix-warmup p…
fzyzcjy Jun 1, 2026
3a76105
Merge branch 'tom/scripted_runtime_and_chunked_testing' into tom/scri…
fzyzcjy Jun 1, 2026
bb0b8c8
Reapply "Add scripted-runtime pool-pressure, evict, and radix-warmup …
fzyzcjy Jun 1, 2026
6a1b5c8
Pressure the row pool with real ballast requests instead of poking fr…
fzyzcjy Jun 1, 2026
5c83123
Encapsulate KV-pool exhaustion in a ScriptedKvPoolExhauster class
fzyzcjy Jun 1, 2026
9d6632e
more
fzyzcjy Jun 1, 2026
a497908
more
fzyzcjy Jun 1, 2026
3c7823d
Add registered tests for new scripted-runtime harness APIs
fzyzcjy Jun 1, 2026
be38f82
Add stream_events accessor backed by tokenizer recv proxy buffer
fzyzcjy Jun 1, 2026
7ba758a
Add in_flight_other_mb_rids accessor reading running_mbs
fzyzcjy Jun 1, 2026
fdcc461
Add load_inquirer accessors for pending-token tallies
fzyzcjy Jun 1, 2026
3f3caf5
Add disagg kv_send event accessors via on_run_batch start_send_idx sn…
fzyzcjy Jun 1, 2026
c2a0db1
Add eagle draft-input capture accessors via on_run_batch snapshots
fzyzcjy Jun 1, 2026
ad13e0f
Add force_lora_drainer_reject control primitive and lora_path start_r…
fzyzcjy Jun 1, 2026
5e2257d
Rewrite regression probes onto real durable state
fzyzcjy Jun 1, 2026
b2453b1
Delete vacuous cumulative_kv_alloc_bytes probe in kv_pressure test
fzyzcjy Jun 1, 2026
b5fe41b
Rewrite SWA budget/evict probes onto real durable state
fzyzcjy Jun 1, 2026
5776a41
Drop redundant disagg_send_state probe in naive disagg test
fzyzcjy Jun 1, 2026
addee53
Rewrite priority disagg_send_state probes onto real Req fields
fzyzcjy Jun 1, 2026
1554087
Rewrite spec verify/draft-state probes onto real durable state
fzyzcjy Jun 1, 2026
5e0014b
Rewrite hisparse staging probes onto Req.hisparse_staging
fzyzcjy Jun 1, 2026
c1a3778
Rewrite radix priority-skip path probe onto observable consequence
fzyzcjy Jun 1, 2026
527e770
Delete last_chunked_exclude_set_source path probe in special_case
fzyzcjy Jun 1, 2026
3a7f61e
Apply black formatting to scripted-runtime tier34 changes
fzyzcjy Jun 1, 2026
6499e8a
Remove meaningless eagle_*_captured probe from scripted runtime
fzyzcjy Jun 1, 2026
81b3f1c
Remove stream_events accessor and its tokenizer-buffer probe
fzyzcjy Jun 1, 2026
1e4b829
Store origin_input_ids_of_rid dict in batch record instead of prompt_…
fzyzcjy Jun 1, 2026
237f0c5
Remove hisparse scripted test and its hisparse_dma_in_flight accessor
fzyzcjy Jun 1, 2026
46f3ddc
Add scripted-runtime-notes skill for the add-an-API rule
fzyzcjy Jun 1, 2026
587fc97
Broaden scripted-runtime-notes to cover anything scripted-runtime
fzyzcjy Jun 1, 2026
d77f931
Tighten scripted-runtime-notes prose
fzyzcjy Jun 1, 2026
af12dc3
Simplify scripted-runtime trigger to 'anything scripted-runtime related'
fzyzcjy Jun 1, 2026
beb4bf2
Rename scripted-runtime _scheduler attribute to scheduler
fzyzcjy Jun 1, 2026
27e0eee
Apply black formatting after _scheduler rename
fzyzcjy Jun 1, 2026
f380209
Inline single-use scripted-runtime accessors into their tests
fzyzcjy Jun 1, 2026
685f8bd
Drop 'or delete the test' from scripted-runtime-notes anti-pattern
fzyzcjy Jun 1, 2026
852ad69
Rename send_idx_by_rid to send_idx_of_rid dict and drop the comment
fzyzcjy Jun 1, 2026
aadd330
Drop lora drainer scripted test and force_lora_drainer_reject
fzyzcjy Jun 1, 2026
178ad5f
Apply isort blank-line fix after dropping lora test
fzyzcjy Jun 1, 2026
138f8f6
Drop kv_send_events accessors and their batch-record machinery
fzyzcjy Jun 1, 2026
bafedaa
Drop tautological/duplicate scripted invariant tests and fix row_pool…
fzyzcjy Jun 1, 2026
8dd2b30
Replace single-slot chunked tautologies with per-req checks in multi_…
fzyzcjy Jun 1, 2026
6769347
Rebuild force_preempt priority tests on exhaust_kv and drop dead ones
fzyzcjy Jun 1, 2026
a2e6710
Drop tautological special-case tests and tighten chunked observables
fzyzcjy Jun 1, 2026
1d3bcdc
Drop dead/duplicate abort tests and strengthen abort assertions
fzyzcjy Jun 1, 2026
3591ca4
Drop dead regression tests and fold inflight-middle-chunks regression
fzyzcjy Jun 1, 2026
a27b96e
Drop unreachable piecewise-CG tests and tighten tail-chunk count
fzyzcjy Jun 1, 2026
8eb018c
Drop dead PP chunked tests and tighten microbatch assertions
fzyzcjy Jun 1, 2026
526a82e
Drop duplicate/erroring spec chunked tests and tighten handoff assert…
fzyzcjy Jun 1, 2026
ae103fe
Tighten chunk-size boundary asserts to ==0 and add page+1 boundary test
fzyzcjy Jun 1, 2026
ca07bdf
Drop duplicate KV-pressure tests, force real retract gate, fix priori…
fzyzcjy Jun 1, 2026
e683412
Drop duplicate lifecycle test and tighten chunks_done boundary asserts
fzyzcjy Jun 1, 2026
cc00a75
Fix always-false LoRA id assertions in chunked lora tests
fzyzcjy Jun 1, 2026
43fb008
Make radix policy/priority chunked tests observable and fix priority …
fzyzcjy Jun 1, 2026
60980ff
Strengthen disagg/http-smoke/swa chunked observables and drop duplicate
fzyzcjy Jun 1, 2026
74637e1
Add max_new_tokens decode-forward divergence scripted tests
fzyzcjy Jun 1, 2026
736928c
Add mixed-chunk prefill budget test under enable_mixed_chunk
fzyzcjy Jun 1, 2026
dc09e5b
Fix string-valued priority kwargs in regression chunked test
fzyzcjy Jun 1, 2026
2a6b82b
Fix ScriptedReqHandle construction in abort unknown-rid test
fzyzcjy Jun 1, 2026
7c93fcb
Apply black/isort formatting to chunked scripted tests
fzyzcjy Jun 1, 2026
d32afce
Drop runtime-broken sampler-kwarg tests and fix finished_reason checks
fzyzcjy Jun 1, 2026
3972571
Default scripted runtime to the non-overlap scheduler
fzyzcjy Jun 1, 2026
7016c11
Count the completing chunk in chunks_done
fzyzcjy Jun 1, 2026
0eb7ce0
Report zero kv_pages once a req releases its row
fzyzcjy Jun 1, 2026
259f5a2
Fix exhaust_row_pool scheduler attr and ballast context overflow
fzyzcjy Jun 1, 2026
30d0dfd
Fix scripted_runtime_core scheduler refs, flush ordering, decode targets
fzyzcjy Jun 1, 2026
3d39694
Fix ScriptedReqHandle construction in manual abort/invariants tests
fzyzcjy Jun 1, 2026
f180498
Revert "Default scripted runtime to the non-overlap scheduler"
fzyzcjy Jun 1, 2026
a51f87f
Make chunks_done exact mid-flight, not just after completion
fzyzcjy Jun 1, 2026
867913f
Document that the scripted runtime requires the overlap scheduler
fzyzcjy Jun 1, 2026
6b36d3f
Uncap the BackgroundHttpPoster connection pool
fzyzcjy Jun 1, 2026
85b75b7
Account for the overlap pipeline in pause(retract) core test
fzyzcjy Jun 1, 2026
372df05
Snapshot pause(retract) output after the overlap forward drains
fzyzcjy Jun 1, 2026
7776bd2
Strengthen sustained-load and long-decode invariant tests with real o…
fzyzcjy Jun 1, 2026
815eaee
Fix chunk-size-1 monotone tautology and add max-prefill-boundary test
fzyzcjy Jun 1, 2026
ff1f8bd
Fix piecewise tail-chunk count off-by-one (chunks_done 4 to 5)
fzyzcjy Jun 1, 2026
3a3b9dc
Tighten disagg/http-smoke/swa chunk-count asserts and drop contradict…
fzyzcjy Jun 1, 2026
1f0f9ba
Drop broken lock-ref test calling nonexistent exhaust_lock_refs
fzyzcjy Jun 1, 2026
6927845
Add retract-side Stage-A parked-chunk regression test
fzyzcjy Jun 1, 2026
5e674e3
Tighten lifecycle chunk-count asserts from >=2 to exact ceil values
fzyzcjy Jun 1, 2026
87bd5a0
Accept overlap finish at the last_decode pause stage
fzyzcjy Jun 1, 2026
389d427
Tighten deterministic lora chunk-count asserts and fix vacuous H2D-ab…
fzyzcjy Jun 1, 2026
d5cc957
Apply black formatting to round-2 chunked test fixes
fzyzcjy Jun 1, 2026
91b57fa
Detect the chunked-req park across all SWA candidates
fzyzcjy Jun 1, 2026
f4bc61e
Add scripted page_size>1 tests and dedup page cases from chunk_size
fzyzcjy Jun 1, 2026
3532003
Add page_size>1 e2e accuracy test
fzyzcjy Jun 1, 2026
b772f04
Add exhaust_lock_refs harness primitive and restore lock-ref chunked …
fzyzcjy Jun 1, 2026
51046cd
Park (not finish) at the last_decode pause stage under overlap
fzyzcjy Jun 1, 2026
4e11770
Merge branch 'tom/scripted_runtime_and_chunked_testing' into tom/scri…
fzyzcjy Jun 1, 2026
349b497
Merge branch 'tom/scripted_runtime_and_chunked_testing_temp' into tom…
fzyzcjy Jun 1, 2026
29bd8eb
Handle the inevitable last_decode finish inline instead of restructur…
fzyzcjy Jun 1, 2026
2399517
Add source-branch coverage tests for chunked special-cases (checklist c)
fzyzcjy Jun 1, 2026
95f3aa1
Add trigger_abort_on_waiting_timeout harness primitive
fzyzcjy Jun 1, 2026
06f4bca
Replace nonexistent trigger_abort call; assert real waiting-timeout s…
fzyzcjy Jun 1, 2026
cbfc6cf
Replace nonexistent retract_all/pause_retract_all with pause_generati…
fzyzcjy Jun 1, 2026
7f8290f
Remap shutdown-during-chunked test to abort_all
fzyzcjy Jun 1, 2026
59ad8e6
Apply black formatting to chunked coverage tests
fzyzcjy Jun 1, 2026
3b15001
Drop direct-call timeout primitive; drive waiting-timeout sweep via r…
fzyzcjy Jun 1, 2026
0e4c5df
Merge branch 'tom/scripted_runtime_and_chunked_testing_temp' into tom…
fzyzcjy Jun 1, 2026
6ade010
Note real-path-only rule for control primitives in scripted-runtime s…
fzyzcjy Jun 1, 2026
10a0f19
Add Other Tips section to scripted-runtime skill
fzyzcjy Jun 1, 2026
6308d60
Trim Other Tips to the real-path control-primitive rule
fzyzcjy Jun 1, 2026
ec49263
Fix multi_req KV-leak measurement and identical-prompt chunk counts
fzyzcjy Jun 1, 2026
46c3db4
Give each req in the chunk-boundary sweep a distinct prompt token
fzyzcjy Jun 1, 2026
b381a79
Fix TestRadixBasic eviction-drain and chunk-boundary expectations
fzyzcjy Jun 2, 2026
97cfc2e
Merge branch 'main-upstream' into tom/scripted_runtime_and_chunked_te…
fzyzcjy Jun 2, 2026
141c6e2
Make the scripted lock_refs probe SWA-aware and null-safe
fzyzcjy Jun 2, 2026
e4b9e04
Drain the overlap pipeline before asserting abort released resources
fzyzcjy Jun 2, 2026
953bd4b
Fix radix policy and lifecycle sequence test assumptions
fzyzcjy Jun 2, 2026
7a9b214
Apply black formatting to abort drain edits
fzyzcjy Jun 2, 2026
a24924c
Drop v2/dual-queue chunked-prefill tests and assumptions
fzyzcjy Jun 2, 2026
c864d67
Fix TestPPBasic abort-drain and identical-prompt chunk counts
fzyzcjy Jun 2, 2026
ed020ee
Apply overlap-drain, r.req-None guards, and distinct prompts to regre…
fzyzcjy Jun 2, 2026
7601b1f
Apply formatting from pre-commit
fzyzcjy Jun 2, 2026
e485564
Fix PP scripted-runtime tests to match real v1 engine behavior
fzyzcjy Jun 2, 2026
5c518b7
Fix TestRegressionPp scripted tests to match real PP engine behavior
fzyzcjy Jun 2, 2026
3f9e255
Fix SWA scripted-runtime budget tests to match real v1 behavior
fzyzcjy Jun 2, 2026
04304bd
Fix radix scripted-runtime priority/partial-page/disabled tests for v1
fzyzcjy Jun 2, 2026
e8eb94d
Let the scripted harness launch a standalone disagg-prefill server wi…
fzyzcjy Jun 2, 2026
cd81386
Fix nonoverlap disagg retract test to match real chunked_req behavior
fzyzcjy Jun 2, 2026
38c6da7
Fix regression scripted-runtime basic/priority/lpm tests for v1
fzyzcjy Jun 2, 2026
0b28725
Apply formatting from pre-commit
fzyzcjy Jun 2, 2026
61eed00
Fix disagg inflight-queue test for fake transfer observability
fzyzcjy Jun 2, 2026
237b232
Fix disagg partial-page test config and observation order
fzyzcjy Jun 2, 2026
aa8a849
Fix most special_case scripted tests for v1 (distinct prompts, drains…
fzyzcjy Jun 2, 2026
90dec59
Fix four TestSpecialCaseBasic scripted tests to match real v1 behavior
fzyzcjy Jun 2, 2026
8926b68
Apply black reformat after special_case v1 test fixes
fzyzcjy Jun 2, 2026
78be392
Drain overlap pipeline before abort-clears-chunked-slot assert
fzyzcjy Jun 2, 2026
2b255d7
Apply black formatting
fzyzcjy Jun 2, 2026
1a6ee32
Fix init_next_round fill_ids invariant to prefix plus in-flight chunk
fzyzcjy Jun 2, 2026
7d57cae
Apply black formatting
fzyzcjy Jun 2, 2026
cfb0e74
Fix ChunkedRemReadd rem-nonpositive test to assert the real force-re-…
fzyzcjy Jun 2, 2026
9bad8d3
Move row-pool-exhaustion bypass test to a small-max_running_requests …
fzyzcjy Jun 2, 2026
8ead099
Land the abort-during-gap test at a reachable mid-chunk state
fzyzcjy Jun 2, 2026
ee0b68f
Add kv_canary PP self-test fixture and SWA divergence coverage
fzyzcjy Jun 2, 2026
43f0b63
Add scripted-runtime harness core and wire scheduler/IPC hooks
fzyzcjy Jun 2, 2026
1cfadc2
Add scripted-runtime KV-pool and lock-ref exhauster primitives
fzyzcjy Jun 2, 2026
3501f01
Add scripted-runtime unit, core integration, and chunked-prefill tests
fzyzcjy Jun 2, 2026
e00be15
Remove scripted disagg tests and revert the fake-prefill SRT relaxation
fzyzcjy Jun 5, 2026
11bc498
Rename abort-mid-chunk radix test to match what it actually asserts
fzyzcjy Jun 5, 2026
c7bdeb8
Add PD x PP x chunked-prefill end-to-end test
fzyzcjy Jun 5, 2026
57513a3
Add PD x PP x chunked e2e test alongside the chunked_prefill e2e suite
fzyzcjy Jun 5, 2026
0b9c47b
Merge branch 'main-upstream' into tom/scripted_runtime_and_chunked_te…
fzyzcjy Jun 5, 2026
16a2b65
Merge sgl-project PR #26991 (scripted_runtime_extracted_chain) into t…
fzyzcjy Jun 5, 2026
9f368d5
Apply black formatting to the merged core_1gpu finished-case assertion
fzyzcjy Jun 5, 2026
574c03c
Fix scripted-test observation bugs: probe-all finish waits, None-safe…
fzyzcjy Jun 5, 2026
84e654b
Correct decode-forward-count law for the overlap scheduler
fzyzcjy Jun 5, 2026
7b90b00
Point LoRA and EAGLE scripted tests at models that exist and valid po…
fzyzcjy Jun 5, 2026
f2875c4
Drop remaining single-server disagg scripted tests; fix retract-abort…
fzyzcjy Jun 5, 2026
4664fa0
Make KV-pressure tests use retractable peers and valid configs
fzyzcjy Jun 5, 2026
57c872d
Apply black formatting to scripted test fixes
fzyzcjy Jun 5, 2026
308fce8
Restore DETERMINISTIC_ALIGN_SIZE swallowed by the TinyChunk class del…
fzyzcjy Jun 5, 2026
2265d52
Fix remaining manual scripted-test failures from the H200 rerun (roun…
fzyzcjy Jun 5, 2026
2edb7cf
Round 3: honest pressure pattern, pause-leak fix, co-batch premise fixes
fzyzcjy Jun 5, 2026
6e062e0
Apply black formatting to round-3 fixes
fzyzcjy Jun 5, 2026
3cc45ef
Freeze chunks_done only after release in the abort-mid-chunk radix test
fzyzcjy Jun 5, 2026
c2548be
Latch the chunked+decode coexistence window; add rotation chunk diagn…
fzyzcjy Jun 5, 2026
b7763c0
De-alias per-script prompt tokens; engine-native small-KV-pool pressu…
fzyzcjy Jun 5, 2026
3404574
Per-adapter pair assertion for LoRA rotation; move chunked+decode coe…
fzyzcjy Jun 5, 2026
bcaed43
Keep a positive page floor under raw KV exhaustion; drain warm lock_refs
fzyzcjy Jun 5, 2026
f551ab2
Remove stale twin asserts after the pair reformulation; accept mixed-…
fzyzcjy Jun 5, 2026
f632d14
Replace the last raw-exhaustion pressure with engine-native ballast p…
fzyzcjy Jun 5, 2026
98ef65c
Adaptively top up ballast pressure until the 16-token admission gate …
fzyzcjy Jun 5, 2026
bc3c10b
Make kv-recovery ballast prompts decisively close the admission gate
fzyzcjy Jun 5, 2026
ac9d8e8
Rebuild kv-recovery pressure on the proven ballast+chunked regime
fzyzcjy Jun 5, 2026
5cb8847
Fix e2e chunked suite: real kv-canary CLI flags and base-class collec…
fzyzcjy Jun 5, 2026
da8fe11
E2E canary pairing/opt-outs and a gsm8k-capable LoRA setup
fzyzcjy Jun 5, 2026
b42ebb6
Replace the stale Llama-1-era second LoRA adapter in the e2e pair
fzyzcjy Jun 5, 2026
5f45cb1
Calibrate Llama-3.2-1B e2e thresholds, opt spec out of canary, drop t…
fzyzcjy Jun 5, 2026
c19fb6c
Switch the spec e2e to the proven EAGLE3 stack
fzyzcjy Jun 5, 2026
a80e474
Merge the coexistence test into the existing mixed-chunk class
fzyzcjy Jun 5, 2026
b67412e
Merge branch 'main-upstream' into tom/scripted_runtime_and_chunked_te…
fzyzcjy Jun 6, 2026
cfa3f15
Remove the pause_generation mode assert; abort legitimately reaches t…
fzyzcjy Jun 6, 2026
2244491
Merge branch 'main-upstream' into tom/scripted_runtime_and_chunked_te…
fzyzcjy Jun 6, 2026
fb917d0
Minimize comment-only divergence from upstream in shared test files
fzyzcjy Jun 6, 2026
a81e0b6
Drop dead eval surface and deduplicate harness internals
fzyzcjy Jun 6, 2026
14de978
Fold the chunked e2e tester into a single Gsm8k mixin
fzyzcjy Jun 6, 2026
e5fb69c
Merge remote-tracking branch 'upstream/main' into tom/scripted_runtim…
fzyzcjy Jun 7, 2026
31c3964
Drop non-essential edits to upstream-landed registered tests and eval…
fzyzcjy Jun 7, 2026
d3090fb
Keep SingleEvalResult return on one line (match upstream)
fzyzcjy Jun 7, 2026
8d2af96
Add mixed-prefix gsm8k eval and its CPU unit test
fzyzcjy Jun 7, 2026
27fbff8
Merge mixed-prefix gsm8k eval (#27502) as this PR's base
fzyzcjy Jun 7, 2026
9ebf28e
more
fzyzcjy Jun 7, 2026
e6b8b18
Strip all comments and docstrings added by this PR
fzyzcjy Jun 7, 2026
cd5573b
Merge comment/docstring strip for chunked-testing PR
fzyzcjy Jun 7, 2026
015200a
Conditionally exclude in-flight other-mb chunked-resume reqs (PP, max…
fzyzcjy May 14, 2026
19e2353
Refine PP cross-mb in-flight exclusion to last-chunk EXTEND reqs only
fzyzcjy Jun 7, 2026
a59441f
Detect finished reqs via the batch log when a short-circuit skips the…
fzyzcjy Jun 7, 2026
722fd4b
Cap num_examples by slicing instead of asserting
fzyzcjy Jun 7, 2026
7c3b7b3
Merge assert-fix from mixed_prefix_gsm8k_eval (#27502)
fzyzcjy Jun 7, 2026
a3996e1
Merge branch 'main-upstream' into tom/mixed_prefix_gsm8k_eval
fzyzcjy Jun 8, 2026
c1d8ff1
Merge latest main (via #27502) into this branch — picks up #27512 mim…
fzyzcjy Jun 8, 2026
4eb5bc6
Remove PP cross-mb in-flight chunked-exclude fix and its tautological…
fzyzcjy Jun 8, 2026
8c8f08d
Merge branch 'main-upstream' into tom/mixed_prefix_gsm8k_eval
fzyzcjy Jun 8, 2026
f240a51
Merge tom/mixed_prefix_gsm8k_eval (latest main) into tom/scripted_run…
fzyzcjy Jun 8, 2026
9b01721
Merge branch 'main-upstream' into tom/mixed_prefix_gsm8k_eval
fzyzcjy Jun 8, 2026
17f139b
Merge tom/mixed_prefix_gsm8k_eval (latest main) into tom/scripted_run…
fzyzcjy Jun 8, 2026
6e4b4dd
Merge remote-tracking branch 'upstream/main' into tom/scripted_runtim…
fzyzcjy Jun 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 129 additions & 0 deletions python/sglang/test/chunked_prefill_test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
from __future__ import annotations

import time
from types import SimpleNamespace
from typing import ClassVar, List, Optional

from sglang.srt.utils import kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.server_fixtures.disaggregation_fixture import (
PDDisaggregationServerBase,
)
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
popen_launch_server,
try_cached_model,
)

# Model served by default; ChunkedGsm8kMixin.model falls back to this value.
DEFAULT_MODEL: str = "Qwen/Qwen3-0.6B"

# Defaults for the server launch and the gsm8k eval run. ChunkedGsm8kMixin
# exposes most of them as overridable class attributes.
DEFAULT_CHUNKED_PREFILL_SIZE: int = 256
DEFAULT_NUM_EXAMPLES: int = 100
DEFAULT_NUM_SHOTS: int = 10
# NOTE(review): not referenced in the visible part of this file; presumably
# meant for subclasses that want longer few-shot prompts — confirm.
LONG_PROMPT_NUM_SHOTS: int = 24
DEFAULT_NUM_THREADS: int = 128
DEFAULT_MAX_TOKENS: int = 512
# Passed to the eval as mixed_prefix_gsm8k_seed.
DEFAULT_SEED: int = 42

# Server CLI flags appended to the launch args of every test class that leaves
# use_kv_canary enabled. Flag/value pairs are adjacent list entries.
KV_CANARY_ARGS: List[str] = [
"--kv-canary",
"raise",
"--kv-canary-real-data",
"partial",
"--kv-canary-sweep-interval",
"100",
"--disable-piecewise-cuda-graph",
]


class ChunkedGsm8kMixin:
__test__ = False
use_kv_canary: ClassVar[bool] = True
model: ClassVar[str] = DEFAULT_MODEL
feature_args: ClassVar[List[str]] = []

chunked_prefill_size: ClassVar[int] = DEFAULT_CHUNKED_PREFILL_SIZE
num_shots: ClassVar[int] = DEFAULT_NUM_SHOTS
num_examples: ClassVar[int] = DEFAULT_NUM_EXAMPLES
num_threads: ClassVar[int] = DEFAULT_NUM_THREADS
max_tokens: ClassVar[int] = DEFAULT_MAX_TOKENS
gsm8k_threshold: ClassVar[float]

@classmethod
def build_prefill_side_args(cls) -> List[str]:
    """Build the extra server CLI args for the prefill-side (or only) server.

    This is a classmethod because it reads class-level configuration only,
    so ``setUpClass`` hooks can call ``cls.build_prefill_side_args()``
    without instantiating a test case. Calling it on an instance still
    works, so existing callers are unaffected.

    Returns:
        A new list: ``--chunked-prefill-size`` and its value, then the
        entries of ``feature_args``, then the entries of ``KV_CANARY_ARGS``
        when ``use_kv_canary`` is truthy.
    """
    # Copy the shared module-level list so callers can never mutate it.
    canary = list(KV_CANARY_ARGS) if cls.use_kv_canary else []
    return (
        ["--chunked-prefill-size", str(cls.chunked_prefill_size)]
        + list(cls.feature_args)
        + canary
    )
Comment on lines +54 to +60

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Instantiating the test case class manually inside setUpClass just to call build_prefill_side_args is an anti-pattern. Since build_prefill_side_args only accesses class-level variables, it should be defined as a @classmethod.

    @classmethod
    def build_prefill_side_args(cls) -> List[str]:
        canary = list(KV_CANARY_ARGS) if cls.use_kv_canary else []
        return (
            ["--chunked-prefill-size", str(cls.chunked_prefill_size)]
            + list(cls.feature_args)
            + canary
        )


def test_mixed_prefix_gsm8k_chunked(self):
    """Run the ``mixed_prefix_gsm8k`` eval and check the score.

    The eval settings come from the class attributes (model, token and
    example limits, thread and shot counts). The metrics, including the
    elapsed wall time, are printed with the fixture class name. The test
    fails when no score is returned or the score is below
    ``gsm8k_threshold``.
    """
    eval_settings = dict(
        base_url=self.base_url,
        model=self.model,
        eval_name="mixed_prefix_gsm8k",
        api="chat_completion",
        max_tokens=self.max_tokens,
        num_examples=self.num_examples,
        num_threads=self.num_threads,
        num_shots=self.num_shots,
        mixed_prefix_gsm8k_secondary_pool_size=15,
        mixed_prefix_gsm8k_seed=DEFAULT_SEED,
        gsm8k_data_path=None,
        temperature=0.0,
    )
    eval_args = SimpleNamespace(**eval_settings)

    # Time only the eval call itself.
    started_at = time.perf_counter()
    metrics = run_eval(eval_args)
    metrics["elapsed_sec"] = time.perf_counter() - started_at
    print(f"[{type(self).__name__}] {metrics} threshold={self.gsm8k_threshold:.4f}")

    score = metrics.get("score")
    self.assertIsNotNone(score, "run_eval returned no score")
    self.assertGreaterEqual(score, self.gsm8k_threshold)


class ChunkedTestBase(ChunkedGsm8kMixin, CustomTestCase):
__test__ = False

base_url: ClassVar[str] = DEFAULT_URL_FOR_TEST
launch_timeout: ClassVar[int] = DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH

process: ClassVar[Optional[object]] = None

@classmethod
def setUpClass(cls):
    """Launch the server under test once for the whole test class.

    The extra CLI args come from ``build_prefill_side_args``. The handle
    returned by ``popen_launch_server`` is stored on ``cls.process``.
    """
    # A throwaway test-case instance is built only to reach the arg builder.
    arg_source = cls("test_mixed_prefix_gsm8k_chunked")
    extra_args = arg_source.build_prefill_side_args()
    cls.process = popen_launch_server(
        cls.model,
        cls.base_url,
        timeout=cls.launch_timeout,
        other_args=extra_args,
    )
Comment on lines +98 to +104

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Call the refactored classmethod build_prefill_side_args directly on cls instead of instantiating the test case class.

    @classmethod
    def setUpClass(cls):
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=cls.launch_timeout,
            other_args=cls.build_prefill_side_args(),
        )


@classmethod
def tearDownClass(cls):
    """Kill the launched server's process tree, if a server was started."""
    server = cls.process
    if server is None:
        # No process handle was recorded, so there is nothing to clean up.
        return
    kill_process_tree(server.pid)


class ChunkedTestPDBase(ChunkedGsm8kMixin, PDDisaggregationServerBase):
__test__ = False
decode_feature_args: ClassVar[List[str]] = []

@classmethod
def setUpClass(cls):
cls.extra_prefill_args = cls(
"test_mixed_prefix_gsm8k_chunked"
).build_prefill_side_args()
Comment on lines +117 to +120

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Call the refactored classmethod build_prefill_side_args directly on cls instead of instantiating the test case class.

    @classmethod
    def setUpClass(cls):
        cls.extra_prefill_args = cls.build_prefill_side_args()

canary = list(KV_CANARY_ARGS) if cls.use_kv_canary else []
cls.extra_decode_args = canary + list(cls.decode_feature_args)
PDDisaggregationServerBase.setUpClass()
cls.model = try_cached_model(cls.model)
cls.launch_all()

@classmethod
def tearDownClass(cls):
    """Delegate class-level teardown to ``PDDisaggregationServerBase``."""
    # NOTE(review): this calls the base class explicitly rather than via
    # super(). If the base tearDownClass is a classmethod, it runs with
    # ``cls`` bound to PDDisaggregationServerBase, not to this subclass.
    # Confirm that anything launch_all() stored on the subclass is still
    # cleaned up.
    PDDisaggregationServerBase.tearDownClass()
8 changes: 6 additions & 2 deletions python/sglang/test/scripted_runtime/context/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ def start_req(
return_logprob: bool = False,
logprob_start_len: Optional[int] = None,
top_logprobs_num: Optional[int] = None,
stop_token_ids: Optional[List[int]] = None,
temperature: Optional[float] = None,
lora_path: Optional[str] = None,
) -> "ScriptedReqHandle":
return self._req_starter.start_req(
Expand All @@ -77,6 +79,8 @@ def start_req(
return_logprob=return_logprob,
logprob_start_len=logprob_start_len,
top_logprobs_num=top_logprobs_num,
stop_token_ids=stop_token_ids,
temperature=temperature,
lora_path=lora_path,
)

Expand All @@ -89,8 +93,8 @@ def continue_generation(self, *, torch_empty_cache: bool = False) -> None:
def abort_all(self) -> None:
return lifecycle.abort_all(self)

def abort(self, handle: "ScriptedReqHandle") -> None:
return lifecycle.abort(self, rid=handle.rid)
def abort(self, handle: "ScriptedReqHandle", *, await_arrival: bool = True) -> None:
return lifecycle.abort(self, rid=handle.rid, await_arrival=await_arrival)

def flush_cache(self) -> None:
return lifecycle.flush_cache(self)
Expand Down
33 changes: 24 additions & 9 deletions python/sglang/test/scripted_runtime/context/http_post.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,34 @@ def _http_post_and_await_recv_msg(
predicate: Callable[[Any], bool],
description: str,
timeout_s: float = RECV_MSG_ARRIVAL_TIMEOUT_S,
) -> None:
_submit_post(ctx, path=path, json=json)
ctx._tokenizer_recv_proxy.wait_until_arrived(
predicate,
timeout_s=timeout_s,
description=description,
)


def _http_post_fire_and_forget(
ctx: "ScriptedContext",
*,
path: str,
json: Optional[Dict[str, Any]],
) -> None:
_submit_post(ctx, path=path, json=json)


def _submit_post(
ctx: "ScriptedContext",
*,
path: str,
json: Optional[Dict[str, Any]],
) -> None:
server_args = ctx.scheduler.server_args
url = f"http://{server_args.host}:{server_args.port}{path}"

async def _post() -> None:
try:
await ctx._http_poster.post(url, json)
except Exception: # noqa: BLE001 — fire-and-forget background POST
logger.exception("scripted_runtime: POST %s failed", path)
await ctx._http_poster.post(url, json)
Comment on lines 49 to +50

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Removing the try-except block from the background fire-and-forget POST task can lead to unhandled exceptions if the HTTP request fails (e.g., due to network issues or server crashes). This can cause silent failures or crash the event loop depending on how submit_coro handles background task exceptions. It is safer to keep the exception logging.

    async def _post() -> None:
        try:
            await ctx._http_poster.post(url, json)
        except Exception:
            logger.exception("scripted_runtime: POST %s failed", path)


ctx._http_poster.submit_coro(_post())
ctx._tokenizer_recv_proxy.wait_until_arrived(
predicate,
timeout_s=timeout_s,
description=description,
)
14 changes: 12 additions & 2 deletions python/sglang/test/scripted_runtime/context/lifecycle.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,24 @@
)
from sglang.test.scripted_runtime.context.http_post import (
_http_post_and_await_recv_msg,
_http_post_fire_and_forget,
)

if TYPE_CHECKING:
from sglang.test.scripted_runtime.context.api import ScriptedContext


def _await_control(
ctx: "ScriptedContext", *, path: str, json, expect_type: type
ctx: "ScriptedContext",
*,
path: str,
json,
expect_type: type,
await_arrival: bool = True,
) -> None:
if not await_arrival:
_http_post_fire_and_forget(ctx, path=path, json=json)
return
_http_post_and_await_recv_msg(
ctx,
path=path,
Expand Down Expand Up @@ -57,12 +66,13 @@ def abort_all(ctx: "ScriptedContext") -> None:
)


def abort(ctx: "ScriptedContext", *, rid: str, await_arrival: bool = True) -> None:
    """Abort the single request ``rid`` through the ``/abort_request`` endpoint.

    Args:
        ctx: Scripted-runtime context used to issue the control call.
        rid: Request id to abort (``abort_all`` is always sent as ``False``).
        await_arrival: Forwarded to ``_await_control``. When ``True`` (the
            default, matching the previous behaviour) the call goes through
            the post-and-await path with ``expect_type=AbortReq``; when
            ``False`` the POST is sent fire-and-forget and this returns
            without waiting.
    """
    _await_control(
        ctx,
        path="/abort_request",
        json={"rid": rid, "abort_all": False},
        expect_type=AbortReq,
        await_arrival=await_arrival,
    )


Expand Down
17 changes: 14 additions & 3 deletions python/sglang/test/scripted_runtime/context/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,20 @@ def find_req_by_rid(ctx: "ScriptedContext", rid: str) -> Optional["Req"]:

def is_finished(ctx: "ScriptedContext", rid: str) -> bool:
    """Return whether request ``rid`` has finished.

    Resolution order:

    1. If ``find_req_by_rid`` still finds the request, defer to
       ``req.finished()``.
    2. Otherwise, a rid already recorded in ``ctx._seen_rids`` counts as
       finished.
    3. Otherwise, a rid that appears in any record of the scheduler hook's
       ``_batch_log`` counts as finished; it is added to ``ctx._seen_rids``
       so later calls take the cheaper path in step 2.
    4. Anything else is reported as not finished.
    """
    req = find_req_by_rid(ctx, rid)
    if req is not None:
        return req.finished()
    if rid in ctx._seen_rids:
        return True
    # Fallback: if the req ran in a forward batch (recorded in _batch_log) but
    # is now absent from all active scheduler sets, it must have finished.
    # This catches requests that completed without ever being observed via
    # find_req_by_rid (e.g. when Python short-circuit evaluation prevents the
    # query while another request is still running).
    batch_log = ctx._scheduler_hook._batch_log
    if any(rid in record.rids for record in batch_log):
        ctx._seen_rids.add(rid)
        return True
    return False


def is_chunking(ctx: "ScriptedContext", rid: str) -> bool:
Expand Down
8 changes: 7 additions & 1 deletion python/sglang/test/scripted_runtime/context/req_starter.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

import uuid
from typing import TYPE_CHECKING, Optional
from typing import TYPE_CHECKING, List, Optional

from sglang.test.scripted_runtime.context.http_post import (
_http_post_and_await_recv_msg,
Expand Down Expand Up @@ -30,6 +30,8 @@ def start_req(
return_logprob: bool = False,
logprob_start_len: Optional[int] = None,
top_logprobs_num: Optional[int] = None,
stop_token_ids: Optional[List[int]] = None,
temperature: Optional[float] = None,
lora_path: Optional[str] = None,
) -> ScriptedReqHandle:
ctx = self._ctx
Expand All @@ -39,6 +41,10 @@ def start_req(
self._req_counter += 1

sampling_params = {"max_new_tokens": max_new_tokens, "ignore_eos": ignore_eos}
if stop_token_ids is not None:
sampling_params["stop_token_ids"] = stop_token_ids
if temperature is not None:
sampling_params["temperature"] = temperature
payload = {
"input_ids": [prompt_token] * prompt_len,
"sampling_params": sampling_params,
Expand Down
11 changes: 9 additions & 2 deletions python/sglang/test/scripted_runtime/req_handle.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING, Optional

from sglang.test.scripted_runtime.context.radix import _node_lock_ref

if TYPE_CHECKING:
from sglang.srt.managers.schedule_batch import Req
from sglang.test.scripted_runtime.context.api import ScriptedContext
Expand Down Expand Up @@ -47,5 +49,10 @@ def kv_pages(self) -> int:

@property
def lock_refs(self) -> int:
    """Lock-reference count for this request's ``last_node``.

    Returns ``0`` when the handle has no request (``self.req`` is ``None``)
    or the request has no ``last_node``; otherwise returns whatever
    ``_node_lock_ref`` reports for that node.
    """
    req = self.req
    if req is None:
        # A missing request must not raise on attribute access.
        return 0
    node = req.last_node
    if node is None:
        return 0
    return _node_lock_ref(node)
3 changes: 3 additions & 0 deletions python/sglang/test/scripted_runtime/scheduler_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@ def _drive_engine_through_warmup(ctx: ScriptedContext) -> Generator:
def _reset_engine_state(ctx: ScriptedContext) -> Generator:
scheduler = ctx.scheduler

if scheduler._engine_paused:
ctx.continue_generation()

ctx._release_exhausted_pools()
ctx.abort_all()
for _ in range(RESET_DRAIN_MAX_STEPS):
Expand Down
13 changes: 11 additions & 2 deletions python/sglang/test/scripted_runtime_chunked_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,16 @@ def run_until_finished(handle, *, max_steps: int = DEFAULT_MAX_STEPS):


def run_until_all_finished(handles: List[Any], *, max_steps: int = DEFAULT_MAX_STEPS):
    """Generator that yields once per step until every handle has finished.

    Each handle's finished state is latched: once ``h.finished`` has been
    observed truthy for a handle, that handle is not polled again and stays
    counted as done for the rest of the wait.

    Args:
        handles: Objects exposing a ``finished`` attribute.
        max_steps: Maximum number of polling iterations before giving up.

    Raises:
        AssertionError: If some handle has still not been seen finished after
            ``max_steps`` iterations; the message includes the latched flags.
    """
    done = [False] * len(handles)
    for _ in range(max_steps):
        for i, h in enumerate(handles):
            # `or` short-circuits, so an already-done handle is not re-queried.
            done[i] = done[i] or h.finished
        if all(done):
            return
        yield
    raise AssertionError(
        f"run_until_all_finished: not all reqs finished after {max_steps} "
        f"steps (finished={done})"
    )


Expand All @@ -65,6 +68,12 @@ def warmup_radix(t, prompt_tokens: List[int], *, max_steps: int = DEFAULT_MAX_ST

# Sizing constants for "ballast" requests used by the helpers in this module.
# NOTE(review): presumably the max_new_tokens budget given to a ballast
# request — confirm against the call sites.
BALLAST_MAX_NEW_TOKENS: int = 30000

# NOTE(review): presumably the server's max-total-tokens setting for the
# small-KV-pool test variants — confirm where it is consumed.
SMALL_KV_POOL_MAX_TOTAL_TOKENS: int = 4096

# Small-pool counterpart of BALLAST_MAX_NEW_TOKENS (by name; TODO confirm).
SMALL_KV_POOL_BALLAST_MAX_NEW_TOKENS: int = 512

# Prompt length paired with the value above: 1536 + 512 = 2048, which is half
# of SMALL_KV_POOL_MAX_TOTAL_TOKENS. NOTE(review): verify that ratio is intended.
SMALL_KV_POOL_BALLAST_PROMPT_LEN: int = 1536


def exhaust_row_pool(t, *, leave_rows: int, max_steps: int = DEFAULT_MAX_STEPS):
target: int = t.scheduler.req_to_token_pool.available_size() - leave_rows
Expand Down
Empty file.
13 changes: 13 additions & 0 deletions test/manual/chunked_prefill/test_e2e_disagg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import unittest

from sglang.test.chunked_prefill_test_utils import ChunkedTestPDBase


class TestChunkedFeatureDisagg(ChunkedTestPDBase):
    """Chunked-prefill end-to-end test case built on ``ChunkedTestPDBase``.

    The class only overrides configuration attributes; the test methods come
    from the base class. ``__test__ = True`` opts this subclass into test
    collection.
    """

    __test__ = True
    gsm8k_threshold = 0.50
    use_kv_canary = False


if __name__ == "__main__":
unittest.main()
23 changes: 23 additions & 0 deletions test/manual/chunked_prefill/test_e2e_dp_attention.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import unittest

from sglang.test.chunked_prefill_test_utils import ChunkedTestBase
from sglang.test.test_utils import DEFAULT_MLA_MODEL_NAME_FOR_TEST


class TestChunkedFeatureDPAttention(ChunkedTestBase):
    """Chunked-prefill end-to-end test case with DP attention enabled.

    The class only overrides configuration attributes of ``ChunkedTestBase``:
    it selects ``DEFAULT_MLA_MODEL_NAME_FOR_TEST`` as the model and passes
    ``--tp 2 --dp 2 --enable-dp-attention`` (plus ``--trust-remote-code``) as
    feature arguments. ``__test__ = True`` opts this subclass into test
    collection.
    """

    __test__ = True
    model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
    feature_args = [
        "--trust-remote-code",
        "--tp",
        "2",
        "--enable-dp-attention",
        "--dp",
        "2",
    ]
    gsm8k_threshold = 0.50
    use_kv_canary = False


if __name__ == "__main__":
unittest.main()
Loading
Loading