Skip to content

Commit 5c527ff

Browse files
Support overlap scheduling for speculative decoding
Co-authored-by: Nathan Wang <nathan.r.wang@gmail.com>
1 parent 6b39f9c commit 5c527ff

12 files changed

+2605
-47
lines changed

python/sglang/srt/disaggregation/decode_schedule_batch_mixin.py

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import logging
4+
import os
45
from http import HTTPStatus
56
from typing import TYPE_CHECKING
67

@@ -146,14 +147,31 @@ def process_prebuilt_extend(
146147
hidden_states = torch.stack(hidden_states_list, dim=0).to(self.device)
147148

148149
# local import to avoid circular import
149-
from sglang.srt.speculative.eagle_utils import EagleDraftInput
150+
if (
151+
os.environ.get("SGLANG_ENABLE_EXPERIMENTAL_EAGLE_OVERLAP_SCHEDULE", "0")
152+
== "1"
153+
):
154+
from sglang.srt.speculative.eagle_utils_for_overlap_scheduler import (
155+
EagleDraftInput,
156+
)
157+
158+
spec_info = EagleDraftInput(
159+
topk_p=topk_p,
160+
topk_index=topk_index,
161+
hidden_states=hidden_states,
162+
verified_id=self.output_ids,
163+
spec_steps=server_args.speculative_num_steps,
164+
)
165+
else:
166+
from sglang.srt.speculative.eagle_utils import EagleDraftInput
167+
168+
spec_info = EagleDraftInput(
169+
topk_p=topk_p,
170+
topk_index=topk_index,
171+
hidden_states=hidden_states,
172+
verified_id=self.output_ids,
173+
)
150174

151-
spec_info = EagleDraftInput(
152-
topk_p=topk_p,
153-
topk_index=topk_index,
154-
hidden_states=hidden_states,
155-
verified_id=self.output_ids,
156-
)
157175
spec_info.prepare_for_extend(self)
158176
spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
159177
self.spec_info = spec_info

python/sglang/srt/layers/attention/flashattention_backend.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import os
34
from dataclasses import dataclass
45
from typing import TYPE_CHECKING, Optional, Union
56

@@ -13,7 +14,14 @@
1314
from sglang.srt.managers.schedule_batch import global_server_args_dict
1415
from sglang.srt.mem_cache.memory_pool import SWAKVPool
1516
from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
16-
from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
17+
18+
if os.environ.get("SGLANG_ENABLE_EXPERIMENTAL_EAGLE_OVERLAP_SCHEDULE", "0") == "1":
19+
from sglang.srt.speculative.eagle_utils_for_overlap_scheduler import (
20+
EagleDraftInput,
21+
EagleVerifyInput,
22+
)
23+
else:
24+
from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
1725

1826
if TYPE_CHECKING:
1927
from sglang.srt.layers.radix_attention import RadixAttention
@@ -1894,7 +1902,10 @@ def init_forward_metadata_replay_cuda_graph(
18941902
torch.cumsum(metadata.cache_seqlens_int32, dim=0, dtype=torch.int32)
18951903
)
18961904
accept_length = spec_info.accept_length[:bs]
1897-
if spec_info.accept_length_cpu:
1905+
if getattr(spec_info, "spec_steps", None) is not None:
1906+
# EAGLE + Overlap scheduling code path
1907+
metadata.max_seq_len_q = spec_info.spec_steps + 1
1908+
elif spec_info.accept_length_cpu:
18981909
metadata.max_seq_len_q = max(spec_info.accept_length_cpu) + 1
18991910
else:
19001911
metadata.max_seq_len_q = 1

python/sglang/srt/layers/logits_processor.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,9 @@ class LogitsProcessorOutput:
5959
# Used by speculative decoding (EAGLE)
6060
# The last hidden layers
6161
hidden_states: Optional[torch.Tensor] = None
62+
# Used by speculative decoding (EAGLE) + overlap scheduling
63+
# Speculative accept lengths
64+
accept_length: Optional[torch.Tensor] = None
6265

6366
## Part 2: This part will be assigned in python/sglang/srt/layers/sampler.py::Sampler
6467
# The logprobs of the next tokens. shape: [#seq]

python/sglang/srt/managers/schedule_batch.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,9 @@
108108
"quantization",
109109
"enable_custom_logit_processor",
110110
"disaggregation_mode",
111+
"speculative_num_steps",
112+
"speculative_eagle_topk",
113+
"speculative_num_draft_tokens",
111114
]
112115

113116
# Put some global args for easy access
@@ -903,6 +906,8 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
903906
# Speculative decoding
904907
spec_algorithm: SpeculativeAlgorithm = None
905908
spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]] = None
909+
# Used for EAGLE + Overlap scheduling only. Stores the temporary draft output token locations.
910+
draft_out_cache_loc: Optional[torch.Tensor] = None
906911

907912
# Whether to return hidden states
908913
return_hidden_states: bool = False
@@ -1540,7 +1545,22 @@ def prepare_for_decode(self):
15401545
self.forward_mode = ForwardMode.DECODE
15411546
bs = len(self.reqs)
15421547

1543-
if self.spec_algorithm.is_eagle():
1548+
if self.enable_overlap and self.spec_algorithm.is_eagle():
1549+
assert (
1550+
self.token_to_kv_pool_allocator.page_size == 1
1551+
), "Eagle + Overlap Scheduler currently only supports page size 1"
1552+
self.draft_out_cache_loc, backup_state = self.alloc_token_slots(
1553+
bs
1554+
* global_server_args_dict["speculative_num_steps"]
1555+
* global_server_args_dict["speculative_eagle_topk"],
1556+
backup_state=True,
1557+
)
1558+
self.token_to_kv_pool_allocator.restore_state(backup_state)
1559+
self.out_cache_loc = self.alloc_token_slots(
1560+
bs * global_server_args_dict["speculative_num_draft_tokens"]
1561+
)
1562+
return
1563+
elif self.spec_algorithm.is_eagle():
15441564
# if spec decoding is used, the decode batch is prepared inside
15451565
# `forward_batch_speculative_generation` after running draft models.
15461566
return
@@ -1648,11 +1668,20 @@ def filter_batch(
16481668
if self.multimodal_inputs is not None:
16491669
self.multimodal_inputs = [self.multimodal_inputs[i] for i in keep_indices]
16501670
self.req_pool_indices = self.req_pool_indices[keep_indices_device]
1671+
1672+
if self.spec_algorithm.is_eagle() and self.enable_overlap:
1673+
# In eagle overlap mode, seq_lens is mutated in the EagleWorkerClient's forward_stream,
1674+
# but we copy seq_lens in the scheduler's stream. This is a problem because seq_lens may
1675+
# not have been mutated by EagleWorkerClient before the scheduler stream starts making
1676+
# a copy of it. To avoid this, we synchronize all streams before copying seq_lens.
1677+
torch.cuda.synchronize()
1678+
16511679
self.seq_lens = self.seq_lens[keep_indices_device]
16521680
self.orig_seq_lens = self.orig_seq_lens[keep_indices_device]
16531681
self.out_cache_loc = None
16541682
self.seq_lens_sum = self.seq_lens.sum().item()
1655-
self.output_ids = self.output_ids[keep_indices_device]
1683+
if self.output_ids is not None:
1684+
self.output_ids = self.output_ids[keep_indices_device]
16561685
self.return_logprob = any(req.return_logprob for req in self.reqs)
16571686
if self.return_logprob:
16581687
self.top_logprobs_nums = [self.top_logprobs_nums[i] for i in keep_indices]
@@ -1766,6 +1795,7 @@ def get_model_worker_batch(
17661795
token_type_ids=self.token_type_ids,
17671796
spec_algorithm=self.spec_algorithm,
17681797
spec_info=self.spec_info,
1798+
draft_out_cache_loc=self.draft_out_cache_loc,
17691799
hicache_consumer_index=self.hicache_consumer_index,
17701800
capture_hidden_mode=(
17711801
CaptureHiddenMode.FULL
@@ -1918,6 +1948,8 @@ class ModelWorkerBatch:
19181948
# If set, the output of the batch contains the hidden states of the run.
19191949
capture_hidden_mode: CaptureHiddenMode = None
19201950
hicache_consumer_index: int = 0
1951+
# Used for EAGLE + Overlap scheduling only. Stores the temporary draft output token locations.
1952+
draft_out_cache_loc: Optional[torch.Tensor] = None
19211953

19221954
# Overlap event
19231955
launch_done: Optional[threading.Event] = None

python/sglang/srt/managers/scheduler.py

Lines changed: 54 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,7 @@ def __init__(
317317
logger.info("Overlap scheduler is disabled for embedding models.")
318318

319319
# Launch a tensor parallel worker
320-
if self.enable_overlap:
320+
if self.enable_overlap and not self.spec_algorithm.is_eagle():
321321
TpWorkerClass = TpModelWorkerClient
322322
else:
323323
TpWorkerClass = TpModelWorker
@@ -334,9 +334,16 @@ def __init__(
334334

335335
# Launch a draft worker for speculative decoding
336336
if self.spec_algorithm.is_eagle():
337-
from sglang.srt.speculative.eagle_worker import EAGLEWorker
337+
if self.enable_overlap:
338+
from sglang.srt.speculative.eagle_worker_overlap_thread import (
339+
EAGLEWorkerClient as EAGLEWorkerClass,
340+
)
341+
else:
342+
from sglang.srt.speculative.eagle_worker import (
343+
EAGLEWorker as EAGLEWorkerClass,
344+
)
338345

339-
self.draft_worker = EAGLEWorker(
346+
self.draft_worker = EAGLEWorkerClass(
340347
gpu_id=gpu_id,
341348
tp_rank=tp_rank,
342349
moe_ep_rank=moe_ep_rank,
@@ -820,15 +827,25 @@ def event_loop_overlap(self):
820827
tmp_batch = ScheduleBatch(
821828
reqs=None,
822829
forward_mode=ForwardMode.DUMMY_FIRST,
823-
next_batch_sampling_info=self.tp_worker.cur_sampling_info,
830+
next_batch_sampling_info=(
831+
self.draft_worker.cur_sampling_info
832+
if self.enable_overlap and self.spec_algorithm.is_eagle()
833+
else self.tp_worker.cur_sampling_info
834+
),
824835
)
825836
self.process_batch_result(tmp_batch, None, batch.launch_done)
826837

827838
if self.last_batch:
828839
# Process the results of the last batch
829840
tmp_batch, tmp_result = self.result_queue.popleft()
830841
tmp_batch.next_batch_sampling_info = (
831-
self.tp_worker.cur_sampling_info if batch else None
842+
(
843+
self.draft_worker.cur_sampling_info
844+
if self.enable_overlap and self.spec_algorithm.is_eagle()
845+
else self.tp_worker.cur_sampling_info
846+
)
847+
if batch
848+
else None
832849
)
833850
# NOTE: we should use the current launched batch's launch_done event instead of the last batch's
834851
self.process_batch_result(
@@ -1789,6 +1806,38 @@ def run_batch(
17891806
self.tp_worker.forward_batch_generation(model_worker_batch)
17901807
)
17911808
bid = model_worker_batch.bid
1809+
elif self.enable_overlap:
1810+
if batch.has_grammar:
1811+
raise NotImplementedError(
1812+
"Grammar + EAGLE + Overlap is not supported for now"
1813+
)
1814+
1815+
model_worker_batch = batch.get_model_worker_batch()
1816+
if self.enable_overlap:
1817+
# TODO (timmy): Do not alias seq_lens between forward and scheduler threads.
1818+
# Optimistically estimate the seq_lens_cpu for the next draft forward
1819+
model_worker_batch.seq_lens_cpu.add_(
1820+
self.server_args.speculative_num_steps + 1
1821+
)
1822+
1823+
# Populate fields needed to reuse batch for verify
1824+
model_worker_batch.extend_seq_lens = batch.extend_lens
1825+
model_worker_batch.extend_prefix_lens = batch.prefix_lens
1826+
model_worker_batch.extend_logprob_start_lens = (
1827+
batch.extend_logprob_start_lens
1828+
)
1829+
1830+
(
1831+
logits_output,
1832+
next_token_ids,
1833+
free_cache_loc_cpu,
1834+
bid,
1835+
can_run_cuda_graph,
1836+
next_spec_info,
1837+
) = self.draft_worker.forward_batch_speculative_generation(
1838+
model_worker_batch
1839+
)
1840+
batch.spec_info = next_spec_info
17921841
else:
17931842
(
17941843
logits_output,

python/sglang/srt/managers/scheduler_output_processor_mixin.py

Lines changed: 55 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,14 @@ def process_batch_result_prefill(
5151
)
5252

5353
if self.enable_overlap:
54-
logits_output, next_token_ids, _ = (
55-
self.tp_worker.resolve_last_batch_result(launch_done)
56-
)
54+
if self.spec_algorithm.is_eagle():
55+
logits_output, next_token_ids, _, _, _, _ = (
56+
self.draft_worker.resolve_last_batch_result(launch_done)
57+
)
58+
else:
59+
logits_output, next_token_ids, _ = (
60+
self.tp_worker.resolve_last_batch_result(launch_done)
61+
)
5762
else:
5863
# Move next_token_ids and logprobs to cpu
5964
next_token_ids = next_token_ids.tolist()
@@ -205,9 +210,25 @@ def process_batch_result_decode(
205210
self.num_generated_tokens += len(batch.reqs)
206211

207212
if self.enable_overlap:
208-
logits_output, next_token_ids, can_run_cuda_graph = (
209-
self.tp_worker.resolve_last_batch_result(launch_done)
210-
)
213+
if self.spec_algorithm.is_eagle():
214+
(
215+
logits_output,
216+
next_token_ids,
217+
free_cache_loc_cpu,
218+
# Note: It's important we use out_cache_loc here and not batch.out_cache_loc.
219+
# out_cache_loc stores the out cache locations for the accepted tokens in
220+
# the target verify step, which is what we want. However, batch.out_cache_loc
221+
# contains the out cache locations for all tokens. If we use that, we will end
222+
# up freeing the wrong locations when we free extra delayed tokens in specdec.
223+
out_cache_loc,
224+
_,
225+
can_run_cuda_graph,
226+
) = self.draft_worker.resolve_last_batch_result(launch_done)
227+
else:
228+
logits_output, next_token_ids, can_run_cuda_graph = (
229+
self.tp_worker.resolve_last_batch_result(launch_done)
230+
)
231+
out_cache_loc = batch.out_cache_loc
211232
next_token_logprobs = logits_output.next_token_logprobs
212233
elif batch.spec_algorithm.is_none():
213234
# spec decoding handles output logprobs inside verify process.
@@ -217,28 +238,46 @@ def process_batch_result_decode(
217238

218239
self.token_to_kv_pool_allocator.free_group_begin()
219240

241+
if self.enable_overlap and self.spec_algorithm.is_eagle():
242+
if free_cache_loc_cpu is not None:
243+
free_cache_loc_cpu = free_cache_loc_cpu[free_cache_loc_cpu != 0]
244+
self.token_to_kv_pool_allocator.free(
245+
free_cache_loc_cpu.to("cuda", non_blocking=True)
246+
)
247+
248+
accept_length = logits_output.accept_length.tolist()
249+
idx_to_batch = [
250+
i for i, length in enumerate(accept_length) for _ in range(length + 1)
251+
]
252+
253+
num_generated_tokens_this_batch = len(idx_to_batch)
254+
self.num_generated_tokens += num_generated_tokens_this_batch
255+
self.spec_num_total_accepted_tokens += num_generated_tokens_this_batch
256+
self.spec_num_total_forward_ct += len(batch.reqs)
257+
else:
258+
idx_to_batch = list(range(len(batch.reqs)))
259+
220260
# Check finish condition
221261
# NOTE: the length of reqs and next_token_ids don't match if it is spec decoding.
222262
# We should ignore using next_token_ids for spec decoding cases.
223-
for i, (req, next_token_id) in enumerate(zip(batch.reqs, next_token_ids)):
263+
for i, (b, next_token_id) in enumerate(zip(idx_to_batch, next_token_ids)):
264+
req = batch.reqs[b]
224265
if req.is_retracted:
225266
continue
226267

227268
if self.enable_overlap and req.finished():
228269
# Free the one extra delayed token
229270
if self.page_size == 1:
230-
self.token_to_kv_pool_allocator.free(batch.out_cache_loc[i : i + 1])
271+
self.token_to_kv_pool_allocator.free(out_cache_loc[i : i + 1])
231272
else:
232273
# Only free when the extra token is in a new page
233274
if (
234275
len(req.origin_input_ids) + len(req.output_ids) - 1
235276
) % self.page_size == 0:
236-
self.token_to_kv_pool_allocator.free(
237-
batch.out_cache_loc[i : i + 1]
238-
)
277+
self.token_to_kv_pool_allocator.free(out_cache_loc[i : i + 1])
239278
continue
240279

241-
if batch.spec_algorithm.is_none():
280+
if batch.spec_algorithm.is_none() or self.enable_overlap:
242281
# speculative worker will solve the output_ids in speculative decoding
243282
req.output_ids.append(next_token_id)
244283

@@ -247,8 +286,10 @@ def process_batch_result_decode(
247286
self.tree_cache.cache_finished_req(req)
248287
req.time_stats.completion_time = time.time()
249288

250-
if req.return_logprob and batch.spec_algorithm.is_none():
251-
# speculative worker handles logprob in speculative decoding
289+
if req.return_logprob and (
290+
batch.spec_algorithm.is_none() or self.enable_overlap
291+
):
292+
# non-overlap speculative worker handles logprob in speculative decoding
252293
req.output_token_logprobs_val.append(next_token_logprobs[i])
253294
req.output_token_logprobs_idx.append(next_token_id)
254295
if req.top_logprobs_num > 0:

0 commit comments

Comments
 (0)