Skip to content

Commit 6080ab4

Browse files
zhengkezhou1 and claude committed
fix(multi-host): dist init port collision and lost failure signal
- DIST_INIT_PORT 30000 -> 10011 to match the dist-init port published by the headless service and avoid collision with the HTTP server default (any suite leaving ModelRunConfig.port at 30000 would clash).
- run_model_run now wraps the full server lifecycle in one try/finally so rank 0 publishes a terminal state on every path, including popen_launch_server failures that previously left workers polling /status until wait_for_done's 60-min timeout.
- _publish_state writes exit_code before done so a worker that observes done=True is guaranteed to read the matching exit_code in the same poll, eliminating a race where success was reported despite rank 0 failing.
- _reset_state at run entry prevents state from one run leaking into the next.

Co-Authored-By: Claude Opus 4.7 <[email protected]>
1 parent 130205b commit 6080ab4

1 file changed

Lines changed: 45 additions & 23 deletions

File tree

test/srt/multi_host/run_suite.py

Lines changed: 45 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -29,13 +29,30 @@
2929
dry_run_suite,
3030
)
3131

32-
DIST_INIT_PORT = 30000
32+
DIST_INIT_PORT = 10011
3333
SERVER_PORT = 30000
3434
CONTROL_PORT = 18080
3535

3636
_control_state = {"done": False, "exit_code": 0}
3737

3838

39+
def _publish_state(exit_code: int) -> None:
40+
"""Set terminal state visible to worker ranks via /status.
41+
42+
Writes exit_code BEFORE done so any worker that observes ``done == True``
43+
is guaranteed to read the matching exit_code in the same poll. Without
44+
this ordering a worker can race the two assignments and report success
45+
while rank 0 actually failed.
46+
"""
47+
_control_state["exit_code"] = exit_code
48+
_control_state["done"] = True
49+
50+
51+
def _reset_state() -> None:
52+
_control_state["done"] = False
53+
_control_state["exit_code"] = 0
54+
55+
3956
def _log(message: str) -> None:
4057
print(f"[multi-host-suite] {message}", flush=True)
4158

@@ -223,44 +240,49 @@ def run_model_run(model_run: ModelRun, runtime_cfg: RuntimeConfig) -> int:
223240
_log(
224241
f"Launching model run={model_run.name}, rank={runtime_cfg.node_rank}, port={runtime_cfg.port}"
225242
)
226-
control_server = start_control_server() if runtime_cfg.node_rank == 0 else None
227-
base_url = f"http://{runtime_cfg.host}:{runtime_cfg.port}"
228-
server_process = popen_launch_server(
229-
model=model_run.model.model_path,
230-
base_url=base_url,
231-
timeout=1800,
232-
other_args=build_other_server_args(model_run.model, runtime_cfg),
233-
)
243+
is_rank0 = runtime_cfg.node_rank == 0
244+
_reset_state()
245+
control_server = start_control_server() if is_rank0 else None
246+
server_process = None
234247
exit_code = 0
235248

236249
try:
237-
if runtime_cfg.node_rank == 0:
250+
base_url = f"http://{runtime_cfg.host}:{runtime_cfg.port}"
251+
server_process = popen_launch_server(
252+
model=model_run.model.model_path,
253+
base_url=base_url,
254+
timeout=1800,
255+
other_args=build_other_server_args(model_run.model, runtime_cfg),
256+
)
257+
258+
if is_rank0:
238259
for case in model_run.cases:
239260
run_case(case, model_run.model.model_path, runtime_cfg.port)
240-
_control_state["done"] = True
241-
_control_state["exit_code"] = 0
242261
else:
243262
workload_name = _get_env("WORKLOAD_NAME")
244263
headless_service_name = _get_env("HEADLESS_SERVICE_NAME")
245264
control_url = f"http://{workload_name}-0.{headless_service_name}:{CONTROL_PORT}/status"
246265
exit_code = wait_for_done(control_url, server_process)
247266
except Exception:
248267
exit_code = 1
249-
if runtime_cfg.node_rank == 0:
250-
_control_state["done"] = True
251-
_control_state["exit_code"] = exit_code
252268
raise
253269
finally:
254-
if runtime_cfg.node_rank == 0:
270+
if is_rank0:
271+
# Always publish — covers success, case failure, and popen_launch_server
272+
# failure (where server_process never got assigned). Without this,
273+
# workers spin on /status until wait_for_done's 60-min timeout when
274+
# rank 0 dies during launch.
275+
_publish_state(exit_code)
255276
_log("Keeping control server alive for worker ranks")
256277
time.sleep(30)
257-
_log("Stopping server process")
258-
kill_process_tree(server_process.pid)
259-
try:
260-
server_process.wait(timeout=5)
261-
except subprocess.TimeoutExpired:
262-
server_process.kill()
263-
server_process.wait()
278+
if server_process is not None:
279+
_log("Stopping server process")
280+
kill_process_tree(server_process.pid)
281+
try:
282+
server_process.wait(timeout=5)
283+
except subprocess.TimeoutExpired:
284+
server_process.kill()
285+
server_process.wait()
264286
if control_server is not None:
265287
control_server.shutdown()
266288

0 commit comments

Comments
 (0)