Add ad-hoc wait before the first rollout health check for large MoE models

zhuzilin · zhuzilin · commit f8d4cd3155db · 2025-09-30T11:06:31.000Z
diff --git a/docker/patch/latest/sglang.patch b/docker/patch/latest/sglang.patch
@@ -173,6 +173,77 @@ index 1c541914c..6ed0e522d 100644
  
      async def init_weights_send_group_for_remote_instance(
          self,
+diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
+index 3d901ceb5..af9554b9a 100644
+--- a/python/sglang/srt/managers/tokenizer_manager.py
++++ b/python/sglang/srt/managers/tokenizer_manager.py
+@@ -1060,6 +1060,9 @@ class TokenizerManager(TokenizerCommunicatorMixin):
+         async with self.is_pause_cond:
+             self.is_pause = True
+             self.abort_request(abort_all=True)
++            # do double abort to ensure all in-flight requests are aborted
++            await asyncio.sleep(1)
++            self.abort_request(abort_all=True)
+ 
+     async def continue_generation(self):
+         async with self.is_pause_cond:
+@@ -1514,12 +1517,13 @@ class TokenizerManager(TokenizerCommunicatorMixin):
+             return
+ 
+         if len(recv_obj.input_token_logprobs_val) > 0:
+-            state.input_token_logprobs_val.extend(
+-                recv_obj.input_token_logprobs_val[recv_obj_index]
+-            )
+-            state.input_token_logprobs_idx.extend(
+-                recv_obj.input_token_logprobs_idx[recv_obj_index]
+-            )
++            if recv_obj.input_token_logprobs_val[recv_obj_index]:
++                state.input_token_logprobs_val.extend(
++                    recv_obj.input_token_logprobs_val[recv_obj_index]
++                )
++                state.input_token_logprobs_idx.extend(
++                    recv_obj.input_token_logprobs_idx[recv_obj_index]
++                )
+         state.output_token_logprobs_val.extend(
+             recv_obj.output_token_logprobs_val[recv_obj_index]
+         )
+@@ -1731,14 +1735,24 @@ class TokenizerManager(TokenizerCommunicatorMixin):
+         state.finished = True
+         if recv_obj.finished_reason:
+             out = {
++                "text": "",
++                "output_ids": [],
+                 "meta_info": {
+                     "id": recv_obj.rid,
+                     "finish_reason": recv_obj.finished_reason,
++                    "prompt_tokens": 0,
++                    "completion_tokens": 0,
++                    "model_version": self.server_args.weight_version,
++                    "cached_tokens": 0,
++                    "e2e_latency": 0,
++                    "output_token_logprobs": [[]],
++                    "input_token_logprobs": [[]],
+                 },
+             }
+         else:
+             out = {
+                 "text": "",
++                "output_ids": [],
+                 "meta_info": {
+                     "id": origin_rid,
+                     "finish_reason": {
+@@ -1747,6 +1761,11 @@ class TokenizerManager(TokenizerCommunicatorMixin):
+                     },
+                     "prompt_tokens": 0,
+                     "completion_tokens": 0,
++                    "model_version": self.server_args.weight_version,
++                    "cached_tokens": 0,
++                    "e2e_latency": 0,
++                    "output_token_logprobs": [[]],
++                    "input_token_logprobs": [[]],
+                 },
+             }
+         state.out_list.append(out)
 diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py
 index 0a1cededd..0093fe2a8 100644
 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py
diff --git a/scripts/run-glm4.5-355B-A32B.sh b/scripts/run-glm4.5-355B-A32B.sh
@@ -50,6 +50,9 @@ ROLLOUT_ARGS=(
    --num-steps-per-rollout 4
    --balance-data
    --rollout-stop-token-ids 151329 151336 151338
+
+   # fault tolerance settings
+   --rollout-health-check-first-wait 300
 )
 
 EVAL_ARGS=(
diff --git a/scripts/run-qwen3-235B-A22B.sh b/scripts/run-qwen3-235B-A22B.sh
@@ -62,6 +62,9 @@ ROLLOUT_ARGS=(
 
    --global-batch-size 64
    --balance-data
+
+   # fault tolerance settings
+   --rollout-health-check-first-wait 300
 )
 
 EVAL_ARGS=(
diff --git a/slime/ray/rollout.py b/slime/ray/rollout.py
@@ -58,8 +58,10 @@ def __init__(self, args, pg, wandb_run_id):
         # fault tolerance
         self._health_monitor_thread = None
         self._health_monitor_stop_event = None
-        self._health_check_interval = getattr(args, "rollout_health_check_interval", 10.0)
-        self._health_check_timeout = getattr(args, "rollout_health_check_timeout", 5.0)
+        self._health_check_interval = args.rollout_health_check_interval
+        self._health_check_timeout = args.rollout_health_check_timeout
+        self._health_check_is_first = True
+        self._health_check_first_wait = args.rollout_health_check_first_wait
 
     def get_rollout_engines_and_lock(self):
         return self.rollout_engines, self.rollout_engine_lock, self.num_new_engines
@@ -135,6 +137,11 @@ def _stop_health_monitor(self) -> None:
 
     def _health_monitor_loop(self) -> None:
         assert self._health_monitor_stop_event is not None
+        # TODO: this is hacky. Wait before the first health check so the large MoE has time to become ready.
+        if self._health_check_is_first:
+            if self._health_monitor_stop_event.wait(self._health_check_first_wait):
+                return
+            self._health_check_is_first = False
         while not self._health_monitor_stop_event.is_set():
             self._run_health_checks()
             if self._health_monitor_stop_event.wait(self._health_check_interval):
diff --git a/slime/utils/arguments.py b/slime/utils/arguments.py
@@ -228,6 +228,12 @@ def add_rollout_arguments(parser):
                 default=5.0,
                 help="Timeout in seconds to wait for a rollout engine /health_generate response before killing it.",
             )
+            parser.add_argument(
+                "--rollout-health-check-first-wait",
+                type=float,
+                default=300.0,
+                help="Time to wait for the compilation before the actual health check.",
+            )
 
             # sampling
             parser.add_argument(

Original file line number	Diff line number	Diff line change
`@@ -50,6 +50,9 @@ ROLLOUT_ARGS=(`
`50`	`50`	`--num-steps-per-rollout 4`
`51`	`51`	`--balance-data`
`52`	`52`	`--rollout-stop-token-ids 151329 151336 151338`
	`53`	`+`
	`54`	`+ # fault tolerance settings`
	`55`	`+ --rollout-health-check-first-wait 300`
`53`	`56`	`)`
`54`	`57`
`55`	`58`	`EVAL_ARGS=(`
Original file line number	Diff line number	Diff line change
`@@ -62,6 +62,9 @@ ROLLOUT_ARGS=(`
`62`	`62`
`63`	`63`	`--global-batch-size 64`
`64`	`64`	`--balance-data`
	`65`	`+`
	`66`	`+ # fault tolerance settings`
	`67`	`+ --rollout-health-check-first-wait 300`
`65`	`68`	`)`
`66`	`69`
`67`	`70`	`EVAL_ARGS=(`