sgl-project
diff --git a/python/sglang/srt/disaggregation/decode_schedule_batch_mixin.py b/python/sglang/srt/disaggregation/decode_schedule_batch_mixin.py
Lines changed: 7 additions & 25 deletions
diff --git a/python/sglang/srt/layers/attention/flashattention_backend.py b/python/sglang/srt/layers/attention/flashattention_backend.py
Lines changed: 6 additions & 11 deletions
diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
Lines changed: 5 additions & 3 deletions
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
Lines changed: 0 additions & 1 deletion
diff --git a/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py b/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py
Lines changed: 7 additions & 24 deletions
diff --git a/python/sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py b/python/sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py
Lines changed: 5 additions & 24 deletions
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import logging
-import os
 from http import HTTPStatus
 from typing import TYPE_CHECKING
 
@@ -147,31 +146,14 @@ def process_prebuilt_extend(
             hidden_states = torch.stack(hidden_states_list, dim=0).to(self.device)
 
             # local import to avoid circular import
-            if (
-                os.environ.get("SGLANG_ENABLE_EXPERIMENTAL_EAGLE_OVERLAP_SCHEDULE", "0")
-                == "1"
-            ):
-                from sglang.srt.speculative.eagle_utils_for_overlap_scheduler import (
-                    EagleDraftInput,
-                )
-
-                spec_info = EagleDraftInput(
-                    topk_p=topk_p,
-                    topk_index=topk_index,
-                    hidden_states=hidden_states,
-                    verified_id=self.output_ids,
-                    spec_steps=server_args.speculative_num_steps,
-                )
-            else:
-                from sglang.srt.speculative.eagle_utils import EagleDraftInput
-
-                spec_info = EagleDraftInput(
-                    topk_p=topk_p,
-                    topk_index=topk_index,
-                    hidden_states=hidden_states,
-                    verified_id=self.output_ids,
-                )
+            from sglang.srt.speculative.eagle_utils import EagleDraftInput
 
+            spec_info = EagleDraftInput(
+                topk_p=topk_p,
+                topk_index=topk_index,
+                hidden_states=hidden_states,
+                verified_id=self.output_ids,
+            )
             spec_info.prepare_for_extend(self)
             spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
             self.spec_info = spec_info
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import os
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Optional, Union
 
@@ -14,14 +13,7 @@
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.mem_cache.memory_pool import SWAKVPool
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
-
-if os.environ.get("SGLANG_ENABLE_EXPERIMENTAL_EAGLE_OVERLAP_SCHEDULE", "0") == "1":
-    from sglang.srt.speculative.eagle_utils_for_overlap_scheduler import (
-        EagleDraftInput,
-        EagleVerifyInput,
-    )
-else:
-    from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
+from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
 
 if TYPE_CHECKING:
     from sglang.srt.layers.radix_attention import RadixAttention
@@ -340,6 +332,9 @@ def __init__(
                 model_runner.token_to_kv_pool.full_to_swa_index_mapping
             )
         self.topk = model_runner.server_args.speculative_eagle_topk or 0
+        self.enable_overlap_schedule = (
+            not model_runner.server_args.disable_overlap_schedule
+        )
         self.speculative_num_steps = speculative_num_steps
         self.speculative_num_draft_tokens = (
             model_runner.server_args.speculative_num_draft_tokens
@@ -1902,9 +1897,9 @@ def init_forward_metadata_replay_cuda_graph(
                 torch.cumsum(metadata.cache_seqlens_int32, dim=0, dtype=torch.int32)
             )
             accept_length = spec_info.accept_length[:bs]
-            if getattr(spec_info, "spec_steps", None) is not None:
+            if self.enable_overlap_schedule:
                 # EAGLE + Overlap scheduling code path
-                metadata.max_seq_len_q = spec_info.spec_steps + 1
+                metadata.max_seq_len_q = self.speculative_num_steps + 1
             elif spec_info.accept_length_cpu:
                 metadata.max_seq_len_q = max(spec_info.accept_length_cpu) + 1
             else:
 
@@ -906,7 +906,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
     # Speculative decoding
     spec_algorithm: SpeculativeAlgorithm = None
     spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]] = None
-    # Used for EAGLE + Overlap scheduling only. Stores the temporary draft output token locations.
+    # Used for EAGLE + Overlap scheduling. Stores the temporary draft output KV cache locations.
     draft_out_cache_loc: Optional[torch.Tensor] = None
 
     # Whether to return hidden states
@@ -1695,7 +1695,9 @@ def filter_batch(
 
         self.sampling_info.filter_batch(keep_indices, keep_indices_device)
         if self.spec_info:
-            self.spec_info.filter_batch(keep_indices_device)
+            self.spec_info.filter_batch(
+                keep_indices_device, has_been_filtered=not self.enable_overlap
+            )
 
     def merge_batch(self, other: "ScheduleBatch"):
         # Penalizer orchestrator must be merged before Batch.reqs is merged. This is because
@@ -1948,7 +1950,7 @@ class ModelWorkerBatch:
     # If set, the output of the batch contains the hidden states of the run.
     capture_hidden_mode: CaptureHiddenMode = None
     hicache_consumer_index: int = 0
-    # Used for EAGLE + Overlap scheduling only. Stores the temporary draft output token locations.
+    # Used for EAGLE + Overlap scheduling. Stores the temporary draft output KV cache locations.
     draft_out_cache_loc: Optional[torch.Tensor] = None
 
     # Overlap event
 
@@ -1814,7 +1814,6 @@ def run_batch(
 
                 model_worker_batch = batch.get_model_worker_batch()
                 if self.enable_overlap:
-                    # TODO (timmy): Do not alias seq_lens between forward and scheduler threads.
                     # Optimistically estimate the seq_lens_cpu for the next draft forward
                     model_worker_batch.seq_lens_cpu.add_(
                         self.server_args.speculative_num_steps + 1
 
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import bisect
-import os
 from typing import TYPE_CHECKING, Callable
 
 import torch
@@ -21,12 +20,7 @@
     ForwardBatch,
     ForwardMode,
 )
-
-if os.environ.get("SGLANG_ENABLE_EXPERIMENTAL_EAGLE_OVERLAP_SCHEDULE", "0") == "1":
-    from sglang.srt.speculative.eagle_utils_for_overlap_scheduler import EagleDraftInput
-else:
-    from sglang.srt.speculative.eagle_utils import EagleDraftInput
-
+from sglang.srt.speculative.eagle_utils import EagleDraftInput
 from sglang.srt.utils import (
     require_attn_tp_gather,
     require_gathered_buffer,
@@ -210,23 +204,12 @@ def capture_one_batch_size(self, num_seqs: int, forward: Callable):
             global_dp_buffer_len = None
             global_num_tokens_for_logprob = None
 
-        if (
-            os.environ.get("SGLANG_ENABLE_EXPERIMENTAL_EAGLE_OVERLAP_SCHEDULE", "0")
-            == "1"
-        ):
-            spec_info = EagleDraftInput(
-                topk_p=topk_p,
-                topk_index=topk_index,
-                hidden_states=hidden_states,
-                capture_hidden_mode=CaptureHiddenMode.LAST,
-            )
-        else:
-            spec_info = EagleDraftInput(
-                topk_p=topk_p,
-                topk_index=topk_index,
-                hidden_states=hidden_states,
-                capture_hidden_mode=CaptureHiddenMode.LAST,
-            )
+        spec_info = EagleDraftInput(
+            topk_p=topk_p,
+            topk_index=topk_index,
+            hidden_states=hidden_states,
+            capture_hidden_mode=CaptureHiddenMode.LAST,
+        )
 
         # Forward batch
         forward_batch = ForwardBatch(
 
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import bisect
-import os
 from typing import TYPE_CHECKING, Callable
 
 import torch
@@ -22,15 +21,7 @@
     ForwardBatch,
     ForwardMode,
 )
-
-if os.environ.get("SGLANG_ENABLE_EXPERIMENTAL_EAGLE_OVERLAP_SCHEDULE", "0") == "1":
-    from sglang.srt.speculative.eagle_utils_for_overlap_scheduler import (
-        EagleDraftInput,
-        fast_topk,
-    )
-else:
-    from sglang.srt.speculative.eagle_utils import EagleDraftInput, fast_topk
-
+from sglang.srt.speculative.eagle_utils import EagleDraftInput, fast_topk
 from sglang.srt.utils import (
     require_attn_tp_gather,
     require_gathered_buffer,
@@ -236,20 +227,10 @@ def capture_one_batch_size(self, bs: int, forward: Callable):
         else:
             global_dp_buffer_len = None
 
-        if (
-            os.environ.get("SGLANG_ENABLE_EXPERIMENTAL_EAGLE_OVERLAP_SCHEDULE", "0")
-            == "1"
-        ):
-            spec_info = EagleDraftInput(
-                hidden_states=hidden_states,
-                accept_length=accept_length,
-                spec_steps=self.speculative_num_steps,
-            )
-        else:
-            spec_info = EagleDraftInput(
-                hidden_states=hidden_states,
-                accept_length=accept_length,
-            )
+        spec_info = EagleDraftInput(
+            hidden_states=hidden_states,
+            accept_length=accept_length,
+        )
         spec_info.positions = None
 
         # Forward batch