sgl-project · fzyzcjy · May 28, 2026 · May 28, 2026 · May 28, 2026 · May 28, 2026
@@ -59,7 +59,7 @@
 )
 from sglang.srt.environ import envs
 from sglang.srt.layers.dp_attention import get_attention_tp_size
-from sglang.srt.managers.schedule_batch import FINISH_ABORT, ScheduleBatch
+from sglang.srt.managers.schedule_batch import FINISH_ABORT, ReqPhase, ScheduleBatch
 from sglang.srt.managers.schedule_policy import match_prefix_for_req
 from sglang.srt.managers.utils import GenerationBatchResult
 from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
@@ -79,6 +79,7 @@
     ReqToTokenPool,
 )
 from sglang.srt.mem_cache.swa_memory_pool import SWAKVPool
+from sglang.srt.model_executor.forward_batch_info import ForwardMode
 from sglang.srt.observability.req_time_stats import (
     set_schedule_time_batch,
     set_time_batch,
@@ -154,9 +155,8 @@ def alloc(self, reqs: List[Req]) -> Optional[List[int]]:
             len(reusing) <= 1
         ), "only one chunked request may reuse req_pool_idx in a batch"
         assert all(
-            reqs[i].inflight_middle_chunks > 0 or reqs[i].kv_committed_len > 0
-            for i in reusing
-        ), "reusing request must be chunked or have committed KV"
+            reqs[i].kv_committed_len > 0 for i in reusing
+        ), "reusing request must have committed KV"
 
         need_size = len(reqs) - len(reusing)
         if need_size > len(self.free_slots):
@@ -1396,7 +1396,7 @@ def _pre_alloc(
             kv_loc,
         )
 
-        # Truncate fill_len to kv_committed_len so cache_unfinished_req only
+        # Truncate extend_fill_len to kv_committed_len so cache_unfinished_req only
         # inserts committed KV into the radix tree. The last output token
         # hasn't had KV committed yet (output_ids is 1 ahead).
         # Set prefix_indices so downstream consumers (init_next_round_input,
@@ -1406,7 +1406,16 @@ def _pre_alloc(
         req.prefix_indices = (
             prefix_indices if prefix_len > 0 else torch.empty((0,), dtype=torch.int64)
         )
+        # TODO: start can transiently disagree with len(prefix_indices) under HiCache
+        # decode prefetch, but it is behavior-neutral — only .end is read before
+        # get_new_prebuilt_batch resets extend_range ahead of the prebuilt forward.
         req.set_extend_range(total_prefix_len, req.kv_committed_len)
+        # This prebuilt path never goes through the PrefillAdder, so enter
+        # the extend phase here; prepare_for_decode moves it to DECODE later.
+        # These reqs are not a real chunk sequence; NON_LAST keeps them visible
+        # as holders of not-yet-batched extend resources (pool stats / invariant
+        # checker) until prepare_for_decode flips them to DECODE.
+        req.phase = ReqPhase.EXTEND_NON_LAST
 
         # Return the transfer destination indices:
         if self.scheduler.enable_hisparse:
@@ -1805,11 +1814,13 @@ def get_next_disagg_decode_batch_to_run(
         # Process pending prebuilt batch: output processing + filter + merge
         new_prebuilt_batch = self.get_new_prebuilt_batch()
         if new_prebuilt_batch:
-            assert self.chunked_req is None
             self.batch_result_processor.process_batch_result_prebuilt(
                 new_prebuilt_batch
             )
-            new_prebuilt_batch.filter_batch()
+            is_extend_intermediate = new_prebuilt_batch.is_extend_intermediate or []
+            assert not any(
+                is_extend_intermediate
+            ), "prebuilt batch carries intermediate-mode reqs"
             if not new_prebuilt_batch.is_empty():
                 if self.running_batch.is_empty():
                     self.running_batch = new_prebuilt_batch
@@ -1860,6 +1871,7 @@ def get_new_prebuilt_batch(self: Scheduler) -> Optional[ScheduleBatch]:
             # we can only add at least `num_not_used_batch` new batch to the running queue
             if i < num_not_used_batch:
                 can_run_list.append(req)
+                self._activate_req(req)
                 # Decode-radix path: new requests already matched in
                 # `pop_preallocated`. Retracted requests reset `last_node`,
                 # so re-match only when that state is missing.
@@ -1868,11 +1880,15 @@ def get_new_prebuilt_batch(self: Scheduler) -> Optional[ScheduleBatch]:
                 else:
                     tree_cache = self.tree_cache
                 req.init_next_round_input(tree_cache)
-                # Truncate fill_len to kv_committed_len so cache_unfinished_req
+                # Truncate extend_fill_len to kv_committed_len so cache_unfinished_req
                 # only sees committed KV (full array includes one uncommitted
                 # token because init_next_round_input rebuilt it as full).
                 if req.kv_committed_len is not None:
                     req.set_extend_range(len(req.prefix_indices), req.kv_committed_len)
+                    # This prebuilt path never goes through the PrefillAdder,
+                    # so enter the extend phase here; prepare_for_decode moves
+                    # it to DECODE later.
+                    req.phase = ReqPhase.EXTEND_NON_LAST
             else:
                 waiting_queue.append(req)
 
@@ -1891,6 +1907,7 @@ def get_new_prebuilt_batch(self: Scheduler) -> Optional[ScheduleBatch]:
             self.model_config,
             self.enable_overlap,
             self.spec_algorithm,
+            forward_mode=ForwardMode.PREBUILT,
         )
 
         # construct fake completed prefill

@@ -50,6 +50,7 @@
     FINISH_ABORT,
     FINISH_LENGTH,
     Req,
+    ReqPhase,
     ScheduleBatch,
 )
 from sglang.srt.mem_cache.common import (
@@ -561,7 +562,7 @@ def advance_logprob_pt(i: int, req: Req) -> None:
         optimistic_reqs = [
             (i, req)
             for i, req in enumerate(batch.reqs)
-            if req.pending_bootstrap and req.inflight_middle_chunks <= 0
+            if req.pending_bootstrap and req.phase is not ReqPhase.EXTEND_NON_LAST
         ]
         if optimistic_reqs:
             polls = poll_and_all_reduce_attn_cp_tp_group(
@@ -573,10 +574,15 @@ def advance_logprob_pt(i: int, req: Req) -> None:
                 idx: poll for (idx, _), poll in zip(optimistic_reqs, polls)
             }
 
-        for i, (req, next_token_id) in enumerate(
-            zip(batch.reqs, next_token_ids, strict=True)
+        for i, (req, next_token_id, is_extend_intermediate) in enumerate(
+            zip(
+                batch.reqs,
+                next_token_ids,
+                batch.is_extend_intermediate,
+                strict=True,
+            )
         ):
-            if req.inflight_middle_chunks <= 0:
+            if not is_extend_intermediate:
                 req.time_stats.set_prefill_finished_time()
 
                 # For optimistic requests, check bootstrap before side effects
@@ -622,15 +628,15 @@ def advance_logprob_pt(i: int, req: Req) -> None:
                     except ValueError as e:
                         error_message = f"Grammar accept_token failed for req {req.rid} with token {next_token_id}: {e}"
                         release_kv_cache(req, self.tree_cache)
+                        self._deactivate_req(req)
                         prepare_abort(
                             req,
                             error_message,
                             status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
                         )
                     req.grammar.finished = req.finished()
             else:
-                # being chunked reqs' prefill is not finished
-                req.inflight_middle_chunks -= 1
+                # Prefill of a partially-extended req is not finished yet.
 
                 # Overlap deferred release for optimistic requests stopped in process_prefill_chunk
                 if req.pending_bootstrap:
@@ -722,6 +728,7 @@ def process_disagg_prefill_inflight_queue(
                 undone_reqs.append(req)
             elif poll == KVPoll.Success:  # transfer done
                 release_kv_cache(req, self.tree_cache)  # unlock the tree
+                self._deactivate_req(req)
                 req.finished_reason = FINISH_LENGTH(length=0)
                 # FIXME: clean up req's data in transfer engine
                 if hasattr(req.disagg_kv_sender, "clear"):
@@ -743,6 +750,7 @@ def process_disagg_prefill_inflight_queue(
                     logger.warning(error_message)
                 req.time_stats.trace_ctx.abort(abort_info={"reason": error_message})
                 release_kv_cache(req, self.tree_cache)  # unlock the tree
+                self._deactivate_req(req)
                 prepare_abort(
                     req, error_message, status_code=HTTPStatus.INTERNAL_SERVER_ERROR
                 )
@@ -839,6 +847,15 @@ def handle_bootstrap_failure(self: Scheduler, req: Req) -> None:
             self.metrics_collector.increment_bootstrap_failed_reqs()
         if self.enable_hicache_storage:
             self.tree_cache.release_aborted_request(req.rid)
+        # The stateless scheduler derives the current partially-extended req from
+        # partially_extended_reqs() = active_reqs entries in the EXTEND_NON_LAST
+        # phase, which ignores req.finished(). An aborted req still
+        # sitting in active_reqs with a non-None extend_range would be re-derived
+        # as a partially-extended req and crash process_prefill_chunk (req_pool_idx=None).
+        # Remove it from active_reqs and clear the extend state defensively.
+        req.extend_range = None
+        req.phase = ReqPhase.OTHERS
+        self._deactivate_req(req)
 
     def handle_pending_bootstrap(
         self: Scheduler, req: Req, poll: KVPoll, defer_release: bool
@@ -881,35 +898,31 @@ def check_bootstrap(self: Scheduler, req: Req) -> bool:
         )
 
     def process_prefill_chunk(self: Scheduler) -> None:
-        chunked_req_to_exclude = set()
-        if self.chunked_req:
-            chunked_req_to_exclude.add(self.chunked_req)
-            maybe_cache_unfinished_req(self.chunked_req, self.tree_cache, chunked=True)
-
-            if not self.check_bootstrap(self.chunked_req):
-                self.chunked_req = None  # stop the current chunked prefill
-            elif self.enable_overlap:
-                # Delay KV transfer to process_batch_result_disagg_prefill when overlap is enabled to ensure results are resolved
-                self.chunked_req.tmp_end_idx = min(
-                    self.chunked_req.extend_range.end,
-                    len(self.chunked_req.origin_input_ids),
-                )
-            else:
-                self.send_kv_chunk(self.chunked_req)
-
-            if self.chunked_req is not None:
+        partially_extended_req = next(iter(self.partially_extended_reqs()), None)
+        if partially_extended_req is not None:
+            maybe_cache_unfinished_req(
+                partially_extended_req, self.tree_cache, is_partially_extended=True
+            )
+            if self.check_bootstrap(partially_extended_req):
+                if self.enable_overlap:
+                    # Delay KV transfer to process_batch_result_disagg_prefill when overlap is enabled to ensure results are resolved
+                    partially_extended_req.tmp_end_idx = min(
+                        partially_extended_req.extend_range.end,
+                        len(partially_extended_req.origin_input_ids),
+                    )
+                else:
+                    self.send_kv_chunk(partially_extended_req)
                 self.running_batch.batch_is_full = False
+            else:
+                # Bootstrap not ready (deferred overlap poll) or failed: stop resuming
+                # this chunked prefill, mirroring the old `chunked_req = None`. The
+                # deferred optimistic_release_and_requeue (or the failure handler) owns
+                # the req from here; deactivating again there is an idempotent no-op.
+                self._deactivate_req(partially_extended_req)
 
         if self.last_batch and self.last_batch.forward_mode.is_extend():
-            if self.last_batch.chunked_req:
-                # In the context pipeline parallelism, after the last chunk, the current microbatch still track outdated chunked_req.
-                # We need to discard it.
-                chunked_req_to_exclude.add(self.last_batch.chunked_req)
-
             last_bs = self.last_batch.batch_size()
-            self.last_batch.filter_batch(
-                chunked_req_to_exclude=list(chunked_req_to_exclude)
-            )
+            self.last_batch.filter_batch(skip_extend_intermediate=True)
             if self.last_batch.batch_size() < last_bs:
                 self.running_batch.batch_is_full = False
 
@@ -1017,6 +1030,12 @@ def optimistic_release_and_requeue(self: Scheduler, req: Req) -> None:
         maybe_cache_unfinished_req(req, self.tree_cache)
         release_kv_cache(req, self.tree_cache)
         req.reset_for_retract()
+        # reset_for_retract() clears extend_range, but the req is still in
+        # active_reqs. Since the stateless scheduler derives partially_extended_reqs() from
+        # active_reqs, a requeued req that also remains active would be
+        # double-tracked (stale partially-extended req, load/accounting leak). Deactivate it
+        # before re-enqueue; get_next_batch_to_run reactivates it on reschedule.
+        self._deactivate_req(req)
         req.output_ids = array("q")
         req.start_send_idx = 0
         req.tmp_end_idx = -1

@@ -199,7 +199,6 @@ def _update_state_for_batch(
 
         if can_run_list:
             self.dllm_manager.add_staging_reqs(can_run_list)
-            self.dllm_manager.increment_inflight_middle_chunks()
 
         self.adder = adder
         self.can_run_list = can_run_list
@@ -255,9 +254,8 @@ def process_dllm_incoming_reqs(
 
             # Prepare and add request
             req.init_next_round_input(self.tree_cache)
-            res = adder.add_one_req(
+            res = adder.add_unstarted_extend_req(
                 req,
-                has_chunked_req=True,
                 truncation_align_size=self.truncation_align_size,
             )
 
@@ -337,11 +335,6 @@ def is_empty(self) -> bool:
             return True
         return len(self.waiting_queue) == 0
 
-    def increment_inflight_middle_chunks(self) -> None:
-        """Increment chunked count for all staging requests."""
-        for req in self.staging_queue:
-            req.inflight_middle_chunks += 1
-
     def filter_finished_reqs(self) -> None:
         """Remove finished requests from both queues."""
         self.waiting_queue = [req for req in self.waiting_queue if not req.finished()]

diff --git a/python/sglang/srt/environ.py b/python/sglang/srt/environ.py
@@ -246,6 +246,7 @@ class Envs:
     SGLANG_RECORD_STEP_TIME = EnvBool(False)
     SGLANG_FORCE_SHUTDOWN = EnvBool(False)
     SGLANG_DEBUG_MEMORY_POOL = EnvBool(False)
+    SGLANG_DEBUG_REQS_INVARIANTS = EnvBool(False)
     SGLANG_DEBUG_REVERT_PR = EnvInt(0)
     SGLANG_PHASE_CHECKER_DEBUG = EnvBool(False)
     SGLANG_TEST_REQUEST_TIME_STATS = EnvBool(False)