@@ -47,6 +47,7 @@ def _run_launcher(cmd_to_run, timeout):
4747 cmd_to_run ,
4848 shell = True ,
4949 stdout = subprocess .PIPE ,
50+ stderr = subprocess .PIPE ,
5051 text = True ,
5152 )
5253 stdout , _ = proc .communicate (timeout = timeout )
@@ -78,7 +79,7 @@ def test_rank_not_send_initial_hb(tmp_dir):
7879 ft_cfg_path = _save_ft_cfg (ft_cfg , tmp_dir )
7980 cmd_to_run = f"{ _get_util_script_path ()} --scenario={ _get_func_name ()} --which_rank=1"
8081 launcher_cmd = (
81- "ft_launcher --monitor-interval=1"
82+ "PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
8283 f" --fault-tol-cfg-path={ ft_cfg_path } --nproc-per-node={ WORLD_SIZE } { cmd_to_run } "
8384 )
8485 ret_code , output = _run_launcher (launcher_cmd , DEFAULT_TIMEOUT )
@@ -97,7 +98,7 @@ def test_rank_failed(tmp_dir):
9798 ft_cfg_path = _save_ft_cfg (ft_cfg , tmp_dir )
9899 cmd_to_run = f"{ _get_util_script_path ()} --scenario={ _get_func_name ()} --which_rank=1"
99100 launcher_cmd = (
100- "ft_launcher --monitor-interval=1"
101+ "PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
101102 f" --fault-tol-cfg-path={ ft_cfg_path } --nproc-per-node={ WORLD_SIZE } { cmd_to_run } "
102103 )
103104 ret_code , output = _run_launcher (launcher_cmd , DEFAULT_TIMEOUT )
@@ -115,7 +116,7 @@ def test_ranks_exit_gracefully(tmp_dir):
115116 ft_cfg_path = _save_ft_cfg (ft_cfg , tmp_dir )
116117 cmd_to_run = f"{ _get_util_script_path ()} --scenario={ _get_func_name ()} "
117118 launcher_cmd = (
118- "ft_launcher --monitor-interval=1"
119+ "PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
119120 f" --fault-tol-cfg-path={ ft_cfg_path } --nproc-per-node={ WORLD_SIZE } { cmd_to_run } "
120121 )
121122 ret_code , output = _run_launcher (launcher_cmd , DEFAULT_TIMEOUT )
@@ -135,7 +136,7 @@ def test_launcher_sigterm_graceful_exit(tmp_dir):
135136 ft_cfg_path = _save_ft_cfg (ft_cfg , tmp_dir )
136137 cmd_to_run = f"{ _get_util_script_path ()} --scenario={ _get_func_name ()} --term_handler=return0"
137138 launcher_cmd = (
138- "ft_launcher --monitor-interval=1"
139+ "PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
139140 f" --fault-tol-cfg-path={ ft_cfg_path } --nproc-per-node={ WORLD_SIZE } { cmd_to_run } "
140141 )
141142 ret_code , output = _run_launcher (launcher_cmd , DEFAULT_TIMEOUT )
@@ -156,7 +157,7 @@ def test_launcher_sigterm_ignored(tmp_dir):
156157 ft_cfg_path = _save_ft_cfg (ft_cfg , tmp_dir )
157158 cmd_to_run = f"{ _get_util_script_path ()} --scenario={ _get_func_name ()} --term_handler=ignore"
158159 launcher_cmd = (
159- "ft_launcher --term-timeout=5 --monitor-interval=1"
160+ "PYTHONFAULTHANDLER=1 ft_launcher --term-timeout=5 --monitor-interval=1"
160161 f" --fault-tol-cfg-path={ ft_cfg_path } --nproc-per-node={ WORLD_SIZE } { cmd_to_run } "
161162 )
162163 ret_code , output = _run_launcher (launcher_cmd , DEFAULT_TIMEOUT )
@@ -177,7 +178,7 @@ def test_ranks_restart(tmp_dir):
177178 ft_cfg_path = _save_ft_cfg (ft_cfg , tmp_dir )
178179 cmd_to_run = f"{ _get_util_script_path ()} --scenario={ _get_func_name ()} --tmp_dir={ tmp_dir } "
179180 launcher_cmd = (
180- "ft_launcher --max-restarts=2 --monitor-interval=1"
181+ "PYTHONFAULTHANDLER=1 ft_launcher --max-restarts=2 --monitor-interval=1"
181182 f" --fault-tol-cfg-path={ ft_cfg_path } --nproc-per-node={ WORLD_SIZE } { cmd_to_run } "
182183 )
183184 ret_code , output = _run_launcher (launcher_cmd , DEFAULT_TIMEOUT )
@@ -199,7 +200,7 @@ def test_missing_cfg(tmp_dir):
199200 # By default, launcher should raise an error if FT config can't be read
200201 cmd_to_run = f"{ _get_util_script_path ()} --scenario=test_ranks_exit_gracefully"
201202 launcher_cmd = (
202- "ft_launcher --monitor-interval=1"
203+ "PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
203204 f" --fault-tol-cfg-path={ empty_ft_cfg_path } --nproc-per-node={ WORLD_SIZE } { cmd_to_run } "
204205 )
205206 ret_code , output = _run_launcher (launcher_cmd , DEFAULT_TIMEOUT )
@@ -209,7 +210,7 @@ def test_missing_cfg(tmp_dir):
209210 # Empty config file again, but this time there are FT args in CLI, so should be fine
210211 cmd_to_run = f"{ _get_util_script_path ()} --scenario=test_ranks_exit_gracefully"
211212 launcher_cmd = (
212- "ft_launcher --monitor-interval=1"
213+ "PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
213214 f" --fault-tol-cfg-path={ empty_ft_cfg_path } --nproc-per-node={ WORLD_SIZE } --ft-param-rank_heartbeat_timeout=1.0"
214215 f" { cmd_to_run } "
215216 )
@@ -218,7 +219,7 @@ def test_missing_cfg(tmp_dir):
218219 # Empty config file again, launcher run with `--ignore-missing-fault-tol-cfg` should use defaults
219220 cmd_to_run = f"{ _get_util_script_path ()} --scenario=test_ranks_exit_gracefully"
220221 launcher_cmd = (
221- "ft_launcher --monitor-interval=1"
222+ "PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
222223 f" --fault-tol-cfg-path={ empty_ft_cfg_path } --ignore-missing-fault-tol-cfg"
223224 f" --nproc-per-node={ WORLD_SIZE } { cmd_to_run } "
224225 )
@@ -227,7 +228,7 @@ def test_missing_cfg(tmp_dir):
227228 # Invalid config file path - should fail despite --ignore-missing-fault-tol-cfg and FT args specified via CLI
228229 cmd_to_run = f"{ _get_util_script_path ()} --scenario=test_ranks_exit_gracefully"
229230 launcher_cmd = (
230- "ft_launcher --monitor-interval=1"
231+ "PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
231232 " --fault-tol-cfg-path=/not/there.yaml"
232233 " --ft-param-rank_heartbeat_timeout=1.0"
233234 f" --nproc-per-node={ WORLD_SIZE } --ignore-missing-fault-tol-cfg"
@@ -249,7 +250,10 @@ def test_config_provided_via_cli(tmp_dir):
249250 " --ft-param-log_level=WARNING"
250251 )
251252 cmd_to_run = f"{ _get_util_script_path ()} --scenario=dump_cfg --tmp_dir={ tmp_dir } "
252- launcher_cmd = "ft_launcher" f" { ft_params_str } --nproc-per-node={ WORLD_SIZE } { cmd_to_run } "
253+ launcher_cmd = (
254+ "PYTHONFAULTHANDLER=1 ft_launcher"
255+ f" { ft_params_str } --nproc-per-node={ WORLD_SIZE } { cmd_to_run } "
256+ )
253257 ret_code , output = _run_launcher (launcher_cmd , DEFAULT_TIMEOUT )
254258 assert ret_code == 0 , f"Launcher should return with 0. Ret value={ ret_code } . Output=\n { output } "
255259
@@ -285,7 +289,7 @@ def test_config_provided_via_cli_overwrites_yaml(tmp_dir):
285289 )
286290 cmd_to_run = f"{ _get_util_script_path ()} --scenario=dump_cfg --tmp_dir={ tmp_dir } "
287291 launcher_cmd = (
288- "ft_launcher"
292+ "PYTHONFAULTHANDLER=1 ft_launcher"
289293 f" { ft_params_str } --fault-tol-cfg-path={ ft_cfg_path } --nproc-per-node={ WORLD_SIZE } { cmd_to_run } "
290294 )
291295 ret_code , output = _run_launcher (launcher_cmd , DEFAULT_TIMEOUT )
0 commit comments