Skip to content

Commit 6b4650d

Browse files
committed
cleaner launcher test logs
1 parent d5365dc commit 6b4650d

File tree

1 file changed

+16
-12
lines changed

1 file changed

+16
-12
lines changed

tests/fault_tolerance/unit/test_launcher.py

Lines changed: 16 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,7 @@ def _run_launcher(cmd_to_run, timeout):
4747
cmd_to_run,
4848
shell=True,
4949
stdout=subprocess.PIPE,
50+
stderr=subprocess.PIPE,
5051
text=True,
5152
)
5253
stdout, _ = proc.communicate(timeout=timeout)
@@ -78,7 +79,7 @@ def test_rank_not_send_initial_hb(tmp_dir):
7879
ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
7980
cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()} --which_rank=1"
8081
launcher_cmd = (
81-
"ft_launcher --monitor-interval=1"
82+
"PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
8283
f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
8384
)
8485
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
@@ -97,7 +98,7 @@ def test_rank_failed(tmp_dir):
9798
ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
9899
cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()} --which_rank=1"
99100
launcher_cmd = (
100-
"ft_launcher --monitor-interval=1"
101+
"PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
101102
f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
102103
)
103104
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
@@ -115,7 +116,7 @@ def test_ranks_exit_gracefully(tmp_dir):
115116
ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
116117
cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()}"
117118
launcher_cmd = (
118-
"ft_launcher --monitor-interval=1"
119+
"PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
119120
f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
120121
)
121122
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
@@ -135,7 +136,7 @@ def test_launcher_sigterm_graceful_exit(tmp_dir):
135136
ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
136137
cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()} --term_handler=return0"
137138
launcher_cmd = (
138-
"ft_launcher --monitor-interval=1"
139+
"PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
139140
f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
140141
)
141142
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
@@ -156,7 +157,7 @@ def test_launcher_sigterm_ignored(tmp_dir):
156157
ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
157158
cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()} --term_handler=ignore"
158159
launcher_cmd = (
159-
"ft_launcher --term-timeout=5 --monitor-interval=1"
160+
"PYTHONFAULTHANDLER=1 ft_launcher --term-timeout=5 --monitor-interval=1"
160161
f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
161162
)
162163
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
@@ -177,7 +178,7 @@ def test_ranks_restart(tmp_dir):
177178
ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
178179
cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()} --tmp_dir={tmp_dir}"
179180
launcher_cmd = (
180-
"ft_launcher --max-restarts=2 --monitor-interval=1"
181+
"PYTHONFAULTHANDLER=1 ft_launcher --max-restarts=2 --monitor-interval=1"
181182
f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
182183
)
183184
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
@@ -199,7 +200,7 @@ def test_missing_cfg(tmp_dir):
199200
# By default, the launcher should raise an error if the FT config can't be read
200201
cmd_to_run = f"{_get_util_script_path()} --scenario=test_ranks_exit_gracefully"
201202
launcher_cmd = (
202-
"ft_launcher --monitor-interval=1"
203+
"PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
203204
f" --fault-tol-cfg-path={empty_ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
204205
)
205206
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
@@ -209,7 +210,7 @@ def test_missing_cfg(tmp_dir):
209210
# Empty config file again, but this time there are FT args in the CLI, so it should be fine
210211
cmd_to_run = f"{_get_util_script_path()} --scenario=test_ranks_exit_gracefully"
211212
launcher_cmd = (
212-
"ft_launcher --monitor-interval=1"
213+
"PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
213214
f" --fault-tol-cfg-path={empty_ft_cfg_path} --nproc-per-node={WORLD_SIZE} --ft-param-rank_heartbeat_timeout=1.0"
214215
f" {cmd_to_run}"
215216
)
@@ -218,7 +219,7 @@ def test_missing_cfg(tmp_dir):
218219
# Empty config file again, launcher run with `--ignore-missing-fault-tol-cfg` should use defaults
219220
cmd_to_run = f"{_get_util_script_path()} --scenario=test_ranks_exit_gracefully"
220221
launcher_cmd = (
221-
"ft_launcher --monitor-interval=1"
222+
"PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
222223
f" --fault-tol-cfg-path={empty_ft_cfg_path} --ignore-missing-fault-tol-cfg"
223224
f" --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
224225
)
@@ -227,7 +228,7 @@ def test_missing_cfg(tmp_dir):
227228
# Invalid config file path - should fail despite --ignore-missing-fault-tol-cfg and FT args specified via CLI
228229
cmd_to_run = f"{_get_util_script_path()} --scenario=test_ranks_exit_gracefully"
229230
launcher_cmd = (
230-
"ft_launcher --monitor-interval=1"
231+
"PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
231232
" --fault-tol-cfg-path=/not/there.yaml"
232233
" --ft-param-rank_heartbeat_timeout=1.0"
233234
f" --nproc-per-node={WORLD_SIZE} --ignore-missing-fault-tol-cfg"
@@ -249,7 +250,10 @@ def test_config_provided_via_cli(tmp_dir):
249250
" --ft-param-log_level=WARNING"
250251
)
251252
cmd_to_run = f"{_get_util_script_path()} --scenario=dump_cfg --tmp_dir={tmp_dir}"
252-
launcher_cmd = "ft_launcher" f" {ft_params_str} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
253+
launcher_cmd = (
254+
"PYTHONFAULTHANDLER=1 ft_launcher"
255+
f" {ft_params_str} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
256+
)
253257
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
254258
assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"
255259

@@ -285,7 +289,7 @@ def test_config_provided_via_cli_overwrites_yaml(tmp_dir):
285289
)
286290
cmd_to_run = f"{_get_util_script_path()} --scenario=dump_cfg --tmp_dir={tmp_dir}"
287291
launcher_cmd = (
288-
"ft_launcher"
292+
"PYTHONFAULTHANDLER=1 ft_launcher"
289293
f" {ft_params_str} --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
290294
)
291295
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)

0 commit comments

Comments
 (0)