Enable simple tuning and change the initialization position of forward_batch, sampling_metadata, and forward_metadata

aolemila · aolemila · commit 6da7daaa1148 · 2026-03-05T19:47:52.000+08:00
diff --git a/python/sgl_jax/srt/kernels/ragged_paged_attention/tuned_block_sizes.py b/python/sgl_jax/srt/kernels/ragged_paged_attention/tuned_block_sizes.py
diff --git a/python/sgl_jax/srt/managers/schedule_batch.py b/python/sgl_jax/srt/managers/schedule_batch.py
@@ -191,6 +191,7 @@ def __init__(
 
         # Each decode stage's output ids
         self.output_ids = []
+        # self.next_token_ids_device: jax.Array = None # store it and use device_get to get when need it
         # fill_ids = origin_input_ids + output_ids. Updated if chunked.
         self.fill_ids = []
 
@@ -407,6 +408,7 @@ def init_incremental_detokenize(self):
             self.surr_offset = max(self.read_offset - INIT_INCREMENTAL_DETOKENIZATION_OFFSET, 0)
 
         all_ids = self.origin_input_ids_unpadded + self.output_ids
+        # all_ids = self.origin_input_ids_unpadded + self.output_ids[:-1] if self.next_token_ids_device else self.output_ids
         return all_ids[self.surr_offset :], self.read_offset - self.surr_offset
 
     def check_finished(self, new_accepted_len: int = 1):
@@ -1031,6 +1033,24 @@ def prepare_for_decode(self):
         if self.sampling_info.penalizer_orchestrator.is_required:
             if self.enable_overlap:
                 # TODO: this can be slow, optimize this.
+                # tmp=[]
+                # for req in self.reqs:
+                #     if req.next_token_ids_device:
+                #         output_id=jax.device_get(req.next_token_ids_device).tolist()[0]
+                #         req.next_token_ids_device=None
+                #         req.output_ids[-1]=output_id
+                #         print(f"[next_token_ids_device] {req.output_ids=}",flush=True)
+                #     elif len(req.output_ids):
+                #         output_id = req.output_ids[-1]
+                #         print(f"[len(req.output_ids):] {req.output_ids=}",flush=True)
+                #     else:
+                #         output_id = req.origin_input_ids[-1]
+                #         print(f"[other]] {req.output_ids=}",flush=True)
+                #     tmp.append(output_id)
+
+                # delayed_output_ids = np.array(tmp,dtype=np.int64)
+                # print(f"[prepare_for_decode] {tmp=}",flush=True)
+
                 delayed_output_ids = np.array(
                     [
                         (req.output_ids[-1] if len(req.output_ids) else req.origin_input_ids[-1])
@@ -1042,8 +1062,25 @@ def prepare_for_decode(self):
             else:
                 self.sampling_info.penalizer_orchestrator.cumulate_output_tokens(self.output_ids)
 
+        # output_ids = self.output_ids
+        # if self.enable_overlap:
+        #     print(f"=======overlap========",flush=True)
+        #     valid_output_ids = []
+        #     for req in self.reqs:
+        #         print(f"[for req] {req.next_token_ids_device=}, {req.output_ids=}",flush=True)
+        #         if req.next_token_ids_device:
+        #             output_id=jax.device_get(req.next_token_ids_device).tolist()[0]
+        #             req.next_token_ids_device=None
+        #             req.output_ids[-1]=output_id
+        #             valid_output_ids.append(output_id)
+        #             print(f"[next_token_ids_device 1] {req.output_ids=}",flush=True)
+        #     output_ids = np.concat(valid_output_ids,dtype=np.int32)
+
         # Update fields
         self.input_ids = self.output_ids
+        # self.input_ids = output_ids
+
+        # print(f"[prepare_for_decode] {self.input_ids=}",flush=True)
 
         self.output_ids = None
 
@@ -1215,7 +1252,9 @@ def get_model_worker_batch(
         seq_lens_cpu = self.seq_lens
         real_bs = len(seq_lens_cpu)
         req_pool_indices_cpu = self.req_pool_indices
-        token_indices_with_all_reqs = self.req_to_token_pool.req_to_token[self.req_pool_indices]
+        token_indices_with_all_reqs = self.req_to_token_pool.req_to_token[
+            self.req_pool_indices
+        ]  # cost in pathways, 23ms
 
         # padding seq
         # extend & decode: input_ids, positions, out_cache_loc, cache_loc
@@ -1313,6 +1352,8 @@ def get_model_worker_batch(
 
                 # Fill the array efficiently
                 offset = 0
+                ######################### cost in Pathways 10ms#####################
+                ##### concurrency=256, tp=4, page_size=256, max_running_requests=256
                 for i, (seq_idx, seq_len, aligned_len) in enumerate(
                     zip(valid_indices, valid_seq_lens, aligned_lengths)
                 ):
@@ -1322,6 +1363,7 @@ def get_model_worker_batch(
                     ]
                     # Padding is already zero from initialization
                     offset += aligned_len
+                ######################### cost in Pathways#####################
 
         offset = np.sum(seq_lens_cpu[seq_lens_cpu > 0]) if len(seq_lens_cpu) > 0 else 0
 
@@ -1335,6 +1377,7 @@ def get_model_worker_batch(
         if len(cache_loc_flat) < total_cache_loc_size:
             cache_loc_cpu[len(cache_loc_flat) :] = 0
 
+        ####################cost in Pathways 22ms######################
         if bs_padding_size > 0:
             invalid_req_pool_indices = np.array(
                 [-1] * bs_padding_size, dtype=req_pool_indices_cpu.dtype
@@ -1365,6 +1408,8 @@ def get_model_worker_batch(
                     [extend_logprob_start_lens, invalid_extend_logprob_start_lens], axis=0
                 )
 
+        ############################################################################
+
         sampling_info = self.sampling_info
         if self.sampling_info:
             new_temperatures = np.concatenate(
diff --git a/python/sgl_jax/srt/managers/scheduler_output_processor_mixin.py b/python/sgl_jax/srt/managers/scheduler_output_processor_mixin.py
@@ -89,23 +89,40 @@ def process_batch_result_prefill(
         hidden_state_offset = 0
         # Check finish conditions
         logprob_pt = 0
+        # print(f"[process_batch_result_prefill] {len(batch.reqs)=}",flush=True)
         for i, (req, next_token_id) in enumerate(zip(batch.reqs, next_token_ids)):
+            # for i, req in enumerate(batch.reqs):
             if req.is_retracted:
                 continue
 
             req.latest_bid = batch.bid
 
+            # if self.enable_overlap:
+            # next_token_id=next_token_ids[i]
+
             if self.is_mixed_chunk and self.enable_overlap and req.finished():
                 j = len(batch.out_cache_loc) - len(batch.reqs) + i
                 self.token_to_kv_pool_allocator.free(batch.out_cache_loc[j : j + 1])
                 continue
 
+            # print(f"[process_batch_result_prefill] {req.is_chunked=}, {next_token_id=}",flush=True)
             if req.is_chunked <= 0:
                 # req output_ids are set here
                 req.output_ids.append(next_token_id)
+                # if self.enable_overlap:
+                #     req.next_token_ids_device=next_token_id
+                #     req.output_ids.append(0)
+                # else:
+                #     req.output_ids.append(next_token_id)
+
+                # print(f"============[process_batch_result_prefill] {req.output_ids=}, {req.next_token_ids_device=}",flush=True)
+
                 req.check_finished()
 
                 if req.finished():
+                    # if req.next_token_ids_device:
+                    #     req.output_ids[-1] = jax.device_get(req.next_token_ids_device).tolist()[0]
+                    #     req.next_token_ids_device=None
                     self.maybe_collect_routed_experts(req)
                     if precision_tracer.get_trace_active():
                         precision_tracer.set_request_status_to_completed(req.rid)
@@ -275,6 +292,7 @@ def process_batch_result_decode(
 
         # Check finish condition
         for i, (req, next_token_id) in enumerate(zip(batch.reqs, next_token_ids)):
+            # for i, req in enumerate(batch.reqs):
             req: Req
             if req.is_retracted:
                 continue
@@ -292,15 +310,25 @@ def process_batch_result_decode(
                     self.token_to_kv_pool_allocator.free(indices_to_free)
                 continue
 
+            # next_token_id = next_token_ids[i]
+
             new_accepted_len = 1
             if batch.spec_algorithm is None or batch.spec_algorithm.is_none():
                 req.output_ids.append(next_token_id)
+                # if self.enable_overlap:
+                #     req.output_ids.append(0)
+                #     req.next_token_ids_device=next_token_id
+                # else:
+                #     req.output_ids.append(next_token_id)
             elif self.spec_algorithm.is_eagle():
                 req.output_ids.extend(next_token_id)
                 new_accepted_len = len(next_token_id)
 
             req.check_finished(new_accepted_len)
             if req.finished():
+                # if req.next_token_ids_device:
+                #     req.output_ids[-1] = jax.device_get(req.next_token_ids_device).tolist()[0]
+                #     req.next_token_ids_device=None
                 self.maybe_collect_routed_experts(req)
                 if batch.spec_algorithm is not None and batch.spec_algorithm.is_eagle():
                     cur_allocate_len = batch.spec_info.allocate_lens[i]
diff --git a/python/sgl_jax/srt/managers/scheduler_profiler_mixing.py b/python/sgl_jax/srt/managers/scheduler_profiler_mixing.py
@@ -38,15 +38,17 @@ def start_profile(
         if output_dir is None:
             output_dir = os.getenv("SGLANG_JAX_PROFILER_DIR", "/tmp")
 
-        # check permission for output_dir
-        tmp_output_dir = output_dir
-        while not os.path.exists(tmp_output_dir):
-            tmp_output_dir = os.path.dirname(tmp_output_dir)
-        if not os.access(tmp_output_dir, os.W_OK):
-            return ProfileReqOutput(
-                success=False,
-                message=f"no permission to write the {output_dir}",
-            )
+        if not output_dir.startswith("gs"):
+            # output dirs with a "gs" prefix are used in Pathways; the local permission check is skipped for them
+            # check permission for output_dir
+            tmp_output_dir = output_dir
+            while not os.path.exists(tmp_output_dir):
+                tmp_output_dir = os.path.dirname(tmp_output_dir)
+            if not os.access(tmp_output_dir, os.W_OK):
+                return ProfileReqOutput(
+                    success=False,
+                    message=f"no permission to write the {output_dir}",
+                )
 
         self.profiler_output_dir = output_dir
         self.profile_id = profile_id
diff --git a/python/sgl_jax/srt/managers/tp_worker.py b/python/sgl_jax/srt/managers/tp_worker.py
@@ -606,6 +606,7 @@ def forward_batch_generation(
                     sampling_metadata,
                 )
                 cache_miss_count += count()
+                next_token_ids_device = jax.copy_to_host_async(next_token_ids_device)
             if model_worker_batch.return_output_logprob_only:
                 logprobs = self.model_runner.compute_logprobs(token_logprobs, next_token_ids_device)
                 logits_output.next_token_logprobs = logprobs[: model_worker_batch.real_bs]
diff --git a/python/sgl_jax/srt/managers/tp_worker_overlap_thread.py b/python/sgl_jax/srt/managers/tp_worker_overlap_thread.py
@@ -102,6 +102,22 @@ def forward_thread_func_(self):
             if not model_worker_batch:
                 break
 
+            # initialize forward_metadata, sampling_metadata and forward_batch for DeepSeek-R1-Distill-Qwen-1.5B
+            if sampling_metadata is None:
+                sampling_metadata = SamplingMetadata.from_model_worker_batch(
+                    model_worker_batch,
+                    len(model_worker_batch.seq_lens) - model_worker_batch.real_bs,
+                    self.mesh,
+                    self.worker.model_config.vocab_size,
+                )
+
+            forward_metadata = self.worker.model_runner.attn_backend.get_forward_metadata(
+                model_worker_batch
+            )
+            model_worker_batch.forward_batch = ForwardBatch.init_new(
+                model_worker_batch, self.worker.get_model_runner()
+            )
+
             # Resolve future tokens in the input
             input_ids = model_worker_batch.forward_batch.input_ids
             model_worker_batch.forward_batch.input_ids = resolve_future_token_ids(
@@ -118,7 +134,6 @@ def forward_thread_func_(self):
                         forward_metadata=forward_metadata,
                     )
                 )
-
             # Update the future token ids map
             self.future_token_ids_map = set_future_token_ids(
                 self.future_token_ids_map,
@@ -143,7 +158,7 @@ def resolve_last_batch_result(self, launch_done: threading.Event | None = None):
             ).tolist()
         if logits_output.hidden_states is not None:
             logits_output.hidden_states = jax.device_get(logits_output.hidden_states)
-        next_token_ids = jax.device_get(next_token_ids).tolist()
+        # next_token_ids = jax.device_get(next_token_ids).tolist()
 
         if launch_done is not None:
             launch_done.wait()
@@ -164,33 +179,33 @@ def forward_batch_generation(
             penalizer_orchestrator=None,
         )
 
-        if sampling_metadata is None:
-            sampling_metadata = SamplingMetadata.from_model_worker_batch(
-                model_worker_batch,
-                len(model_worker_batch.seq_lens) - model_worker_batch.real_bs,
-                self.mesh,
-                self.worker.model_config.vocab_size,
-            )
+        # if sampling_metadata is None:
+        #     sampling_metadata = SamplingMetadata.from_model_worker_batch(
+        #         model_worker_batch,
+        #         len(model_worker_batch.seq_lens) - model_worker_batch.real_bs,
+        #         self.mesh,
+        #         self.worker.model_config.vocab_size,
+        #     )
 
-        forward_metadata = self.worker.model_runner.attn_backend.get_forward_metadata(
-            model_worker_batch
-        )
+        # forward_metadata = self.worker.model_runner.attn_backend.get_forward_metadata(
+        #     model_worker_batch
+        # )
 
         # Prepare LoRA batch if LoRA is enabled
         if self.worker.server_args.enable_lora:
             self.worker.prepare_lora_batch(model_worker_batch)
 
-        model_worker_batch.forward_batch = ForwardBatch.init_new(
-            model_worker_batch, self.worker.get_model_runner()
-        )
+        # model_worker_batch.forward_batch = ForwardBatch.init_new(
+        #     model_worker_batch, self.worker.get_model_runner()
+        # )
 
         # Push a new batch to the queue (JAX handles synchronization automatically)
         self.input_queue.put(
             (
                 model_worker_batch,
                 self.future_token_ids_ct,
-                sampling_metadata,
-                forward_metadata,
+                None,  # sampling_metadata
+                None,  # forward_metadata
             )
         )
 

Original file line number	Diff line number	Diff line change
`@@ -606,6 +606,7 @@ def forward_batch_generation(`
`606`	`606`	`sampling_metadata,`
`607`	`607`	`)`
`608`	`608`	`cache_miss_count += count()`
	`609`	`+ next_token_ids_device = jax.copy_to_host_async(next_token_ids_device)`
`609`	`610`	`if model_worker_batch.return_output_logprob_only:`
`610`	`611`	`logprobs = self.model_runner.compute_logprobs(token_logprobs, next_token_ids_device)`
`611`	`612`	`logits_output.next_token_logprobs = logprobs[: model_worker_batch.real_bs]`