
Commit 5b74667

Merge pull request #101 from TaekyungHeo/grok-job-status
Update JaxToolboxJobStatusRetrievalStrategy to improve error messages
2 parents 883a530 + cf374f5 commit 5b74667

2 files changed: +182 -41 lines changed

src/cloudai/schema/test_template/jax_toolbox/job_status_retrieval_strategy.py

Lines changed: 95 additions & 28 deletions
@@ -32,28 +32,50 @@ def get_job_status(self, output_path: str) -> JobStatusResult:
             JobStatusResult: The result containing the job status and an optional error message.
         """
         profile_stderr_path = os.path.join(output_path, "profile_stderr.txt")
+
+        result = self.check_profile_stderr(profile_stderr_path, output_path)
+        if not result.is_successful:
+            return result
+
+        error_files = list(Path(output_path).glob("error-*.txt"))
+        if not error_files:
+            return JobStatusResult(
+                is_successful=False,
+                error_message=(
+                    f"No 'error-*.txt' files found in the output directory, {output_path}. There are two stages in the "
+                    "Grok run. The profiling stage passed successfully, but something went wrong in the actual run "
+                    "stage. Please ensure the actual run stage completed successfully. "
+                    "Run the generated sbatch script manually to debug."
+                ),
+            )
+
+        return self.check_error_files(error_files, output_path)
+
+    def check_profile_stderr(self, profile_stderr_path: str, output_path: str) -> JobStatusResult:
+        """
+        Check the profile_stderr.txt file for known error messages.
+
+        Args:
+            profile_stderr_path (str): Path to the 'profile_stderr.txt' file.
+            output_path (str): Path to the output directory.
+
+        Returns:
+            JobStatusResult: The result containing the job status and an optional error message.
+        """
         if not os.path.isfile(profile_stderr_path):
             return JobStatusResult(
                 is_successful=False,
                 error_message=(
-                    "profile_stderr.txt file not found in the specified output directory. "
-                    "This file is expected to be created during the profiling stage of the Grok run. "
+                    f"profile_stderr.txt file not found in the specified output directory, {output_path}. "
+                    "This file is expected to be created during the profiling stage. "
                     "Please ensure the profiling stage completed successfully. "
                     "Run the generated sbatch script manually to debug."
                 ),
             )
 
         with open(profile_stderr_path, "r") as file:
             content = file.read()
-            if "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected" in content:
-                return JobStatusResult(
-                    is_successful=False,
-                    error_message=(
-                        "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected. This may be due to missing "
-                        "environment variables, specifically but not limited to CUDA_VISIBLE_DEVICES. "
-                        "Please ensure the environment variables are set correctly and try again."
-                    ),
-                )
+
         if "[PAX STATUS]: E2E time: Elapsed time for " not in content:
             return JobStatusResult(
                 is_successful=False,
@@ -65,36 +87,81 @@ def get_job_status(self, output_path: str) -> JobStatusResult:
                 ),
             )
 
-        error_files = list(Path(output_path).glob("error-*.txt"))
-        if not error_files:
+        result = self.check_common_errors(content, profile_stderr_path, output_path)
+        if not result.is_successful:
+            return result
+
+        return JobStatusResult(is_successful=True)
+
+    def check_common_errors(self, content: str, file_path: str, output_path: str) -> JobStatusResult:
+        """
+        Check for common errors in the file content.
+
+        Args:
+            content (str): The content of the file to check.
+            file_path (str): The path of the file being checked.
+            output_path (str): Path to the output directory.
+
+        Returns:
+            JobStatusResult: The result containing the job status and an optional error message.
+        """
+        if "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected" in content:
             return JobStatusResult(
                 is_successful=False,
                 error_message=(
-                    "No 'error-*.txt' files found in the output directory. There are two stages in the Grok run. "
-                    "The profiling stage passed successfully, but something went wrong in the actual run stage. "
-                    "Please ensure the actual run stage completed successfully. "
-                    "Run the generated sbatch script manually to debug."
+                    "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected. This may be due to missing "
+                    "environment variables, specifically but not limited to CUDA_VISIBLE_DEVICES. "
+                    "First, check if GPUs are available on the server. "
+                    "Second, if running the job with Slurm, ensure proper resource-related options are set, "
+                    "including GPU resource requirements. Lastly, check environment variables. "
+                    "If the problem persists, verify commands and environment variables by running a simple GPU-only "
+                    "example command."
+                ),
+            )
+        if "Terminating process because the coordinator detected missing heartbeats" in content:
+            return JobStatusResult(
+                is_successful=False,
+                error_message=(
+                    "Terminating process because the coordinator detected missing heartbeats. This most likely "
+                    f"indicates that another task died. Please review the file at {file_path} and any relevant logs in"
+                    f" {output_path}. Ensure the servers allocated for this task can reach each other with their "
+                    "hostnames, and they can open any ports and reach others' ports."
                 ),
             )
+        if "NCCL operation ncclGroupEnd() failed" in content:
+            return JobStatusResult(
+                is_successful=False,
+                error_message=(
+                    "NCCL operation ncclGroupEnd() failed: unhandled system error. Please check if the NCCL-test "
+                    "passes. Run with NCCL_DEBUG=INFO for more details."
+                ),
+            )
+
+        return JobStatusResult(is_successful=True)
+
+    def check_error_files(self, error_files: list, output_path: str) -> JobStatusResult:
+        """
+        Check the error-*.txt files for known error messages.
+
+        Args:
+            error_files (list): List of paths to error files.
+            output_path (str): Path to the output directory.
 
+        Returns:
+            JobStatusResult: The result containing the job status and an optional error message.
+        """
         for error_file in error_files:
             with open(error_file, "r") as file:
                 content = file.read()
-                if "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected" in content:
-                    return JobStatusResult(
-                        is_successful=False,
-                        error_message=(
-                            "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected. This may be due to missing "
-                            "environment variables, specifically but not limited to CUDA_VISIBLE_DEVICES. "
-                            "Please ensure the environment variables are set correctly and try again."
-                        ),
-                    )
+                result = self.check_common_errors(content, error_file, output_path)
+                if not result.is_successful:
+                    return result
                 if "E2E time: Elapsed time for" not in content:
                     return JobStatusResult(
                         is_successful=False,
                         error_message=(
-                            f"The file {error_file} does not contain the expected 'E2E time: Elapsed time for' keyword "
-                            "at the end. This indicates the actual run did not complete successfully. "
+                            f"The file {error_file} does not contain the expected 'E2E time: Elapsed time for' "
+                            "keyword at the end. This indicates the actual run did not complete successfully. "
                            "Please debug this manually to ensure the actual run stage completes as expected."
                         ),
                     )
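
The refactor splits the old monolithic get_job_status into three helpers: check_profile_stderr validates the profiling stage, check_common_errors scans a file's content for the shared failure signatures (CUDA_ERROR_NO_DEVICE, missing coordinator heartbeats, ncclGroupEnd() failures), and check_error_files applies those same checks plus the E2E-time check to each error-*.txt from the actual run. A minimal usage sketch follows; the import path and the no-argument constructor are assumptions inferred from the file location and the test file, not something this diff shows.

# Minimal usage sketch (assumed import path and constructor; only get_job_status(output_path) appears in this diff).
from cloudai.schema.test_template.jax_toolbox.job_status_retrieval_strategy import (
    JaxToolboxJobStatusRetrievalStrategy,
)

strategy = JaxToolboxJobStatusRetrievalStrategy()
# output_dir is a hypothetical path to a directory containing profile_stderr.txt and error-*.txt files.
result = strategy.get_job_status("/path/to/jax_toolbox/output_dir")
if not result.is_successful:
    print(result.error_message)

Centralizing the signatures in check_common_errors means the profiling stage and the actual-run stage now report the same, more actionable message for the same root cause.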

tests/test_job_status_retrieval_strategy.py

Lines changed: 87 additions & 13 deletions
@@ -125,8 +125,8 @@ def test_no_profile_stderr_file(self, tmp_path: Path) -> None:
         result = self.js.get_job_status(str(tmp_path))
         assert not result.is_successful
         assert result.error_message == (
-            "profile_stderr.txt file not found in the specified output directory. "
-            "This file is expected to be created during the profiling stage of the Grok run. "
+            f"profile_stderr.txt file not found in the specified output directory, {str(tmp_path)}. "
+            "This file is expected to be created during the profiling stage. "
             "Please ensure the profiling stage completed successfully. "
             "Run the generated sbatch script manually to debug."
         )
@@ -139,10 +139,9 @@ def test_missing_pax_status_keyword(self, tmp_path: Path) -> None:
         result = self.js.get_job_status(str(tmp_path))
         assert not result.is_successful
         assert result.error_message == (
-            "The profiling stage completed but did not generate the expected "
-            "'[PAX STATUS]: E2E time: Elapsed time for ' "
-            "keyword. There are two stages in the Grok run, and an error occurred in the profiling stage. "
-            "While profile_stderr.txt was created, the expected keyword is missing. "
+            "The profiling stage completed but did not generate the expected '[PAX STATUS]: E2E time: "
+            "Elapsed time for ' keyword. There are two stages in the Grok run, and an error occurred in "
+            "the profiling stage. While profile_stderr.txt was created, the expected keyword is missing. "
             "You need to run the sbatch script manually to see what happens."
         )
 
@@ -154,24 +153,29 @@ def test_no_error_files(self, tmp_path: Path) -> None:
         result = self.js.get_job_status(str(tmp_path))
         assert not result.is_successful
         assert result.error_message == (
-            "No 'error-*.txt' files found in the output directory. There are two stages in the Grok run. "
-            "The profiling stage passed successfully, but something went wrong in the actual run stage. "
+            f"No 'error-*.txt' files found in the output directory, {str(tmp_path)}. There are two stages in the Grok "
+            "run. The profiling stage passed successfully, but something went wrong in the actual run stage. "
             "Please ensure the actual run stage completed successfully. "
             "Run the generated sbatch script manually to debug."
         )
 
     def test_cuda_no_device_error_in_profile_stderr(self, tmp_path: Path) -> None:
         """Test that job status is False when profile_stderr.txt contains CUDA_ERROR_NO_DEVICE."""
         profile_stderr_file = tmp_path / "profile_stderr.txt"
-        profile_stderr_content = "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected"
+        profile_stderr_content = "[PAX STATUS]: E2E time: Elapsed time for profiling\n"
+        profile_stderr_content += "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected"
         profile_stderr_file.write_text(profile_stderr_content)
 
         result = self.js.get_job_status(str(tmp_path))
         assert not result.is_successful
         assert result.error_message == (
             "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected. This may be due to missing "
             "environment variables, specifically but not limited to CUDA_VISIBLE_DEVICES. "
-            "Please ensure the environment variables are set correctly and try again."
+            "First, check if GPUs are available on the server. "
+            "Second, if running the job with Slurm, ensure proper resource-related options are set, "
+            "including GPU resource requirements. Lastly, check environment variables. "
+            "If the problem persists, verify commands and environment variables by running a simple GPU-only "
+            "example command."
         )
 
     def test_missing_e2e_time_keyword(self, tmp_path: Path) -> None:
@@ -187,8 +191,8 @@ def test_missing_e2e_time_keyword(self, tmp_path: Path) -> None:
         result = self.js.get_job_status(str(tmp_path))
         assert not result.is_successful
         assert result.error_message == (
-            f"The file {error_file} does not contain the expected 'E2E time: Elapsed time for' keyword at the end. "
-            "This indicates the actual run did not complete successfully. "
+            f"The file {str(error_file)} does not contain the expected 'E2E time: Elapsed time for' keyword at the "
+            "end. This indicates the actual run did not complete successfully. "
             "Please debug this manually to ensure the actual run stage completes as expected."
         )
 
@@ -207,7 +211,11 @@ def test_cuda_no_device_error_in_error_file(self, tmp_path: Path) -> None:
         assert result.error_message == (
             "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected. This may be due to missing "
             "environment variables, specifically but not limited to CUDA_VISIBLE_DEVICES. "
-            "Please ensure the environment variables are set correctly and try again."
+            "First, check if GPUs are available on the server. "
+            "Second, if running the job with Slurm, ensure proper resource-related options are set, "
+            "including GPU resource requirements. Lastly, check environment variables. "
+            "If the problem persists, verify commands and environment variables by running a simple GPU-only "
+            "example command."
         )
 
     def test_successful_job(self, tmp_path: Path) -> None:
@@ -223,3 +231,69 @@ def test_successful_job(self, tmp_path: Path) -> None:
         result = self.js.get_job_status(str(tmp_path))
         assert result.is_successful
         assert result.error_message == ""
+
+    def test_nccl_group_end_error_in_profile_stderr(self, tmp_path: Path) -> None:
+        """Test that job status is False when profile_stderr.txt contains NCCL operation ncclGroupEnd() failed."""
+        profile_stderr_file = tmp_path / "profile_stderr.txt"
+        profile_stderr_content = "[PAX STATUS]: E2E time: Elapsed time for profiling\n"
+        profile_stderr_content += "NCCL operation ncclGroupEnd() failed: unhandled system error"
+        profile_stderr_file.write_text(profile_stderr_content)
+
+        result = self.js.get_job_status(str(tmp_path))
+        assert not result.is_successful
+        assert result.error_message == (
+            "NCCL operation ncclGroupEnd() failed: unhandled system error. Please check if the NCCL-test "
+            "passes. Run with NCCL_DEBUG=INFO for more details."
+        )
+
+    def test_nccl_group_end_error_in_error_file(self, tmp_path: Path) -> None:
+        """Test that job status is False when error-*.txt contains NCCL operation ncclGroupEnd() failed."""
+        profile_stderr_file = tmp_path / "profile_stderr.txt"
+        profile_stderr_content = "[PAX STATUS]: E2E time: Elapsed time for profiling"
+        profile_stderr_file.write_text(profile_stderr_content)
+
+        error_file = tmp_path / "error-1.txt"
+        error_content = "NCCL operation ncclGroupEnd() failed: unhandled system error"
+        error_file.write_text(error_content)
+
+        result = self.js.get_job_status(str(tmp_path))
+        assert not result.is_successful
+        assert result.error_message == (
+            "NCCL operation ncclGroupEnd() failed: unhandled system error. Please check if the NCCL-test "
+            "passes. Run with NCCL_DEBUG=INFO for more details."
+        )
+
+    def test_heartbeat_error_in_profile_stderr(self, tmp_path: Path) -> None:
+        """Test that job status is False when profile_stderr.txt contains coordinator detected missing heartbeats."""
+        profile_stderr_file = tmp_path / "profile_stderr.txt"
+        profile_stderr_content = "[PAX STATUS]: E2E time: Elapsed time for profiling\n"
+        profile_stderr_content += "Terminating process because the coordinator detected missing heartbeats"
+        profile_stderr_file.write_text(profile_stderr_content)
+
+        result = self.js.get_job_status(str(tmp_path))
+        assert not result.is_successful
+        assert result.error_message == (
+            "Terminating process because the coordinator detected missing heartbeats. This most likely "
+            f"indicates that another task died. Please review the file at {str(profile_stderr_file)} and any relevant "
+            f"logs in {str(tmp_path)}. Ensure the servers allocated for this task can reach each other with their "
+            "hostnames, and they can open any ports and reach others' ports."
+        )
+
+    def test_heartbeat_error_in_error_file(self, tmp_path: Path) -> None:
+        """Test that job status is False when error-*.txt contains coordinator detected missing heartbeats."""
+        profile_stderr_file = tmp_path / "profile_stderr.txt"
+        profile_stderr_content = "[PAX STATUS]: E2E time: Elapsed time for profiling"
+        profile_stderr_file.write_text(profile_stderr_content)
+
+        error_file = tmp_path / "error-1.txt"
+        error_content = "Terminating process because the coordinator detected missing heartbeats"
+        error_file.write_text(error_content)
+
+        result = self.js.get_job_status(str(tmp_path))
+        assert not result.is_successful
+        assert result.error_message == (
+            "Terminating process because the coordinator detected missing heartbeats. This most likely "
+            f"indicates that another task died. Please review the file at {str(error_file)} and any relevant logs in"
+            f" {str(tmp_path)}. Ensure the servers allocated for this task can reach each other with their "
+            "hostnames, and they can open any ports and reach others' ports."
        )