@@ -125,8 +125,8 @@ def test_no_profile_stderr_file(self, tmp_path: Path) -> None:
         result = self.js.get_job_status(str(tmp_path))
         assert not result.is_successful
         assert result.error_message == (
-            "profile_stderr.txt file not found in the specified output directory. "
-            "This file is expected to be created during the profiling stage of the Grok run. "
+            f"profile_stderr.txt file not found in the specified output directory, {str(tmp_path)}. "
+            "This file is expected to be created during the profiling stage. "
             "Please ensure the profiling stage completed successfully. "
             "Run the generated sbatch script manually to debug."
         )
@@ -139,10 +139,9 @@ def test_missing_pax_status_keyword(self, tmp_path: Path) -> None:
         result = self.js.get_job_status(str(tmp_path))
         assert not result.is_successful
         assert result.error_message == (
-            "The profiling stage completed but did not generate the expected "
-            "'[PAX STATUS]: E2E time: Elapsed time for ' "
-            "keyword. There are two stages in the Grok run, and an error occurred in the profiling stage. "
-            "While profile_stderr.txt was created, the expected keyword is missing. "
+            "The profiling stage completed but did not generate the expected '[PAX STATUS]: E2E time: "
+            "Elapsed time for ' keyword. There are two stages in the Grok run, and an error occurred in "
+            "the profiling stage. While profile_stderr.txt was created, the expected keyword is missing. "
             "You need to run the sbatch script manually to see what happens."
         )
 
@@ -154,24 +153,29 @@ def test_no_error_files(self, tmp_path: Path) -> None:
         result = self.js.get_job_status(str(tmp_path))
         assert not result.is_successful
         assert result.error_message == (
-            "No 'error-*.txt' files found in the output directory. There are two stages in the Grok run. "
-            "The profiling stage passed successfully, but something went wrong in the actual run stage. "
+            f"No 'error-*.txt' files found in the output directory, {str(tmp_path)}. There are two stages in the Grok "
+            "run. The profiling stage passed successfully, but something went wrong in the actual run stage. "
             "Please ensure the actual run stage completed successfully. "
             "Run the generated sbatch script manually to debug."
         )
 
     def test_cuda_no_device_error_in_profile_stderr(self, tmp_path: Path) -> None:
         """Test that job status is False when profile_stderr.txt contains CUDA_ERROR_NO_DEVICE."""
         profile_stderr_file = tmp_path / "profile_stderr.txt"
-        profile_stderr_content = "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected"
+        profile_stderr_content = "[PAX STATUS]: E2E time: Elapsed time for profiling\n"
+        profile_stderr_content += "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected"
         profile_stderr_file.write_text(profile_stderr_content)
 
         result = self.js.get_job_status(str(tmp_path))
         assert not result.is_successful
         assert result.error_message == (
             "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected. This may be due to missing "
             "environment variables, specifically but not limited to CUDA_VISIBLE_DEVICES. "
-            "Please ensure the environment variables are set correctly and try again."
+            "First, check if GPUs are available on the server. "
+            "Second, if running the job with Slurm, ensure proper resource-related options are set, "
+            "including GPU resource requirements. Lastly, check environment variables. "
+            "If the problem persists, verify commands and environment variables by running a simple GPU-only "
+            "example command."
         )
 
     def test_missing_e2e_time_keyword(self, tmp_path: Path) -> None:
@@ -187,8 +191,8 @@ def test_missing_e2e_time_keyword(self, tmp_path: Path) -> None:
         result = self.js.get_job_status(str(tmp_path))
         assert not result.is_successful
         assert result.error_message == (
-            f"The file {error_file} does not contain the expected 'E2E time: Elapsed time for' keyword at the end. "
-            "This indicates the actual run did not complete successfully. "
+            f"The file {str(error_file)} does not contain the expected 'E2E time: Elapsed time for' keyword at the "
+            "end. This indicates the actual run did not complete successfully. "
             "Please debug this manually to ensure the actual run stage completes as expected."
         )
 
@@ -207,7 +211,11 @@ def test_cuda_no_device_error_in_error_file(self, tmp_path: Path) -> None:
         assert result.error_message == (
             "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected. This may be due to missing "
             "environment variables, specifically but not limited to CUDA_VISIBLE_DEVICES. "
-            "Please ensure the environment variables are set correctly and try again."
+            "First, check if GPUs are available on the server. "
+            "Second, if running the job with Slurm, ensure proper resource-related options are set, "
+            "including GPU resource requirements. Lastly, check environment variables. "
+            "If the problem persists, verify commands and environment variables by running a simple GPU-only "
+            "example command."
         )
 
     def test_successful_job(self, tmp_path: Path) -> None:
@@ -223,3 +231,69 @@ def test_successful_job(self, tmp_path: Path) -> None:
         result = self.js.get_job_status(str(tmp_path))
         assert result.is_successful
         assert result.error_message == ""
+
+    def test_nccl_group_end_error_in_profile_stderr(self, tmp_path: Path) -> None:
+        """Test that job status is False when profile_stderr.txt contains NCCL operation ncclGroupEnd() failed."""
+        profile_stderr_file = tmp_path / "profile_stderr.txt"
+        profile_stderr_content = "[PAX STATUS]: E2E time: Elapsed time for profiling\n"
+        profile_stderr_content += "NCCL operation ncclGroupEnd() failed: unhandled system error"
+        profile_stderr_file.write_text(profile_stderr_content)
+
+        result = self.js.get_job_status(str(tmp_path))
+        assert not result.is_successful
+        assert result.error_message == (
+            "NCCL operation ncclGroupEnd() failed: unhandled system error. Please check if the NCCL-test "
+            "passes. Run with NCCL_DEBUG=INFO for more details."
+        )
+
+    def test_nccl_group_end_error_in_error_file(self, tmp_path: Path) -> None:
+        """Test that job status is False when error-*.txt contains NCCL operation ncclGroupEnd() failed."""
+        profile_stderr_file = tmp_path / "profile_stderr.txt"
+        profile_stderr_content = "[PAX STATUS]: E2E time: Elapsed time for profiling"
+        profile_stderr_file.write_text(profile_stderr_content)
+
+        error_file = tmp_path / "error-1.txt"
+        error_content = "NCCL operation ncclGroupEnd() failed: unhandled system error"
+        error_file.write_text(error_content)
+
+        result = self.js.get_job_status(str(tmp_path))
+        assert not result.is_successful
+        assert result.error_message == (
+            "NCCL operation ncclGroupEnd() failed: unhandled system error. Please check if the NCCL-test "
+            "passes. Run with NCCL_DEBUG=INFO for more details."
+        )
+
+    def test_heartbeat_error_in_profile_stderr(self, tmp_path: Path) -> None:
+        """Test that job status is False when profile_stderr.txt contains coordinator detected missing heartbeats."""
+        profile_stderr_file = tmp_path / "profile_stderr.txt"
+        profile_stderr_content = "[PAX STATUS]: E2E time: Elapsed time for profiling\n"
+        profile_stderr_content += "Terminating process because the coordinator detected missing heartbeats"
+        profile_stderr_file.write_text(profile_stderr_content)
+
+        result = self.js.get_job_status(str(tmp_path))
+        assert not result.is_successful
+        assert result.error_message == (
+            "Terminating process because the coordinator detected missing heartbeats. This most likely "
+            f"indicates that another task died. Please review the file at {str(profile_stderr_file)} and any relevant "
+            f"logs in {str(tmp_path)}. Ensure the servers allocated for this task can reach each other with their "
+            "hostnames, and they can open any ports and reach others' ports."
+        )
+
+    def test_heartbeat_error_in_error_file(self, tmp_path: Path) -> None:
+        """Test that job status is False when error-*.txt contains coordinator detected missing heartbeats."""
+        profile_stderr_file = tmp_path / "profile_stderr.txt"
+        profile_stderr_content = "[PAX STATUS]: E2E time: Elapsed time for profiling"
+        profile_stderr_file.write_text(profile_stderr_content)
+
+        error_file = tmp_path / "error-1.txt"
+        error_content = "Terminating process because the coordinator detected missing heartbeats"
+        error_file.write_text(error_content)
+
+        result = self.js.get_job_status(str(tmp_path))
+        assert not result.is_successful
+        assert result.error_message == (
+            "Terminating process because the coordinator detected missing heartbeats. This most likely "
+            f"indicates that another task died. Please review the file at {str(error_file)} and any relevant logs in"
+            f" {str(tmp_path)}. Ensure the servers allocated for this task can reach each other with their "
+            "hostnames, and they can open any ports and reach others' ports."
+        )
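
For orientation, below is a minimal sketch of the kind of status-checking flow these tests exercise. The checker class name, the helper structure, and the abbreviated messages are assumptions for illustration only; the attribute names (is_successful, error_message), the file names, and the matched keywords come from the assertions above, not from the repository's actual implementation.

# Illustrative sketch only -- not the repository's actual implementation.
from dataclasses import dataclass
from pathlib import Path


@dataclass
class JobStatusResult:
    is_successful: bool
    error_message: str = ""


class GrokJobStatusChecker:  # hypothetical name
    PAX_KEYWORD = "[PAX STATUS]: E2E time: Elapsed time for "
    # Known failure signatures scanned for in both stages. The full messages are
    # asserted verbatim in the tests above; they are shortened here.
    KNOWN_ERRORS = {
        "CUDA_ERROR_NO_DEVICE": "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected. ...",
        "NCCL operation ncclGroupEnd() failed": "NCCL operation ncclGroupEnd() failed: ...",
        "coordinator detected missing heartbeats": "Terminating process because the coordinator ...",
    }

    def get_job_status(self, output_dir: str) -> JobStatusResult:
        out = Path(output_dir)
        profile_stderr = out / "profile_stderr.txt"
        # Stage 1: the profiling run must have produced profile_stderr.txt.
        if not profile_stderr.exists():
            return JobStatusResult(False, "profile_stderr.txt file not found ...")
        profile_text = profile_stderr.read_text()
        for signature, message in self.KNOWN_ERRORS.items():
            if signature in profile_text:
                return JobStatusResult(False, message)
        if self.PAX_KEYWORD not in profile_text:
            return JobStatusResult(False, "The profiling stage completed but did not generate ...")
        # Stage 2: the actual run must have produced error-*.txt files that end cleanly.
        error_files = sorted(out.glob("error-*.txt"))
        if not error_files:
            return JobStatusResult(False, "No 'error-*.txt' files found ...")
        for error_file in error_files:
            text = error_file.read_text()
            for signature, message in self.KNOWN_ERRORS.items():
                if signature in text:
                    return JobStatusResult(False, message)
            if "E2E time: Elapsed time for" not in text:
                return JobStatusResult(False, f"The file {error_file} does not contain ...")
        return JobStatusResult(True, "")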