@@ -173,6 +173,77 @@ index 1c541914c..6ed0e522d 100644
173 173
174174 async def init_weights_send_group_for_remote_instance(
175175 self,
176+ diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
177+ index 3d901ceb5..af9554b9a 100644
178+ --- a/python/sglang/srt/managers/tokenizer_manager.py
179+ +++ b/python/sglang/srt/managers/tokenizer_manager.py
180+ @@ -1060,6 +1060,9 @@ class TokenizerManager(TokenizerCommunicatorMixin):
181+ async with self.is_pause_cond:
182+ self.is_pause = True
183+ self.abort_request(abort_all=True)
184+ + # do double abort to ensure all in-flight requests are aborted
185+ + await asyncio.sleep(1)
186+ + self.abort_request(abort_all=True)
187+
188+ async def continue_generation(self):
189+ async with self.is_pause_cond:
190+ @@ -1514,12 +1517,13 @@ class TokenizerManager(TokenizerCommunicatorMixin):
191+ return
192+
193+ if len(recv_obj.input_token_logprobs_val) > 0:
194+ - state.input_token_logprobs_val.extend(
195+ - recv_obj.input_token_logprobs_val[recv_obj_index]
196+ - )
197+ - state.input_token_logprobs_idx.extend(
198+ - recv_obj.input_token_logprobs_idx[recv_obj_index]
199+ - )
200+ + if recv_obj.input_token_logprobs_val[recv_obj_index]:
201+ + state.input_token_logprobs_val.extend(
202+ + recv_obj.input_token_logprobs_val[recv_obj_index]
203+ + )
204+ + state.input_token_logprobs_idx.extend(
205+ + recv_obj.input_token_logprobs_idx[recv_obj_index]
206+ + )
207+ state.output_token_logprobs_val.extend(
208+ recv_obj.output_token_logprobs_val[recv_obj_index]
209+ )
210+ @@ -1731,14 +1735,24 @@ class TokenizerManager(TokenizerCommunicatorMixin):
211+ state.finished = True
212+ if recv_obj.finished_reason:
213+ out = {
214+ + "text": "",
215+ + "output_ids": [],
216+ "meta_info": {
217+ "id": recv_obj.rid,
218+ "finish_reason": recv_obj.finished_reason,
219+ + "prompt_tokens": 0,
220+ + "completion_tokens": 0,
221+ + "model_version": self.server_args.weight_version,
222+ + "cached_tokens": 0,
223+ + "e2e_latency": 0,
224+ + "output_token_logprobs": [[]],
225+ + "input_token_logprobs": [[]],
226+ },
227+ }
228+ else:
229+ out = {
230+ "text": "",
231+ + "output_ids": [],
232+ "meta_info": {
233+ "id": origin_rid,
234+ "finish_reason": {
235+ @@ -1747,6 +1761,11 @@ class TokenizerManager(TokenizerCommunicatorMixin):
236+ },
237+ "prompt_tokens": 0,
238+ "completion_tokens": 0,
239+ + "model_version": self.server_args.weight_version,
240+ + "cached_tokens": 0,
241+ + "e2e_latency": 0,
242+ + "output_token_logprobs": [[]],
243+ + "input_token_logprobs": [[]],
244+ },
245+ }
246+ state.out_list.append(out)
176 247 diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py
177 248 index 0a1cededd..0093fe2a8 100644
178 249 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py
0 commit comments