Skip to content

Commit 883a530

Browse files
authored
Merge pull request #100 from TaekyungHeo/nccl-job-status
Update NcclTestJobStatusRetrievalStrategy to improve error messages
2 parents ed582c4 + 48ea03f commit 883a530

File tree

2 files changed

+80
-7
lines changed

2 files changed

+80
-7
lines changed

src/cloudai/schema/test_template/nccl_test/job_status_retrieval_strategy.py

Lines changed: 34 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -34,26 +34,57 @@ def get_job_status(self, output_path: str) -> JobStatusResult:
3434
if os.path.isfile(stdout_path):
3535
with open(stdout_path, "r") as file:
3636
content = file.read()
37+
38+
# Check for specific error patterns
39+
if "Test NCCL failure" in content:
40+
return JobStatusResult(
41+
is_successful=False,
42+
error_message=(
43+
f"NCCL test failure detected in {stdout_path}. "
44+
"Possible reasons include network errors or remote process exits. "
45+
"Please review the NCCL test output and errors in the file first. "
46+
"If the issue persists, contact the system administrator."
47+
),
48+
)
49+
if "Test failure" in content:
50+
return JobStatusResult(
51+
is_successful=False,
52+
error_message=(
53+
f"Test failure detected in {stdout_path}. "
54+
"Please review the specific test failure messages in the file. "
55+
"Ensure that the NCCL test environment is correctly set up and configured. "
56+
"If the issue persists, contact the system administrator."
57+
),
58+
)
59+
60+
# Check for success indicators
3761
if "# Out of bounds values" in content and "# Avg bus bandwidth" in content:
3862
return JobStatusResult(is_successful=True)
63+
64+
# Identify missing success indicators
3965
missing_indicators = []
4066
if "# Out of bounds values" not in content:
4167
missing_indicators.append("'# Out of bounds values'")
4268
if "# Avg bus bandwidth" not in content:
4369
missing_indicators.append("'# Avg bus bandwidth'")
70+
4471
error_message = (
4572
f"Missing success indicators in {stdout_path}: {', '.join(missing_indicators)}. "
4673
"These keywords are expected to be present in stdout.txt, usually towards the end of the file. "
47-
f"Please ensure the NCCL test ran to completion. You can run the generated sbatch script manually "
48-
f"and check if {stdout_path} is created and contains the expected keywords."
74+
"Please review the NCCL test output and errors in the file. "
75+
"Ensure the NCCL test ran to completion. You can run the generated sbatch script manually "
76+
f"and check if {stdout_path} is created and contains the expected keywords. "
77+
"If the issue persists, contact the system administrator."
4978
)
5079
return JobStatusResult(is_successful=False, error_message=error_message)
80+
5181
return JobStatusResult(
5282
is_successful=False,
5383
error_message=(
5484
f"stdout.txt file not found in the specified output directory {output_path}. "
5585
"This file is expected to be created as a result of the NCCL test run. "
5686
"Please ensure the NCCL test was executed properly and that stdout.txt is generated. "
57-
f"You can run the generated NCCL test command manually and verify the creation of {stdout_path}."
87+
f"You can run the generated NCCL test command manually and verify the creation of {stdout_path}. "
88+
"If the issue persists, contact the system administrator."
5889
),
5990
)

tests/test_job_status_retrieval_strategy.py

Lines changed: 46 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -33,8 +33,8 @@ def test_no_stdout_file(self, tmp_path: Path) -> None:
3333
f"stdout.txt file not found in the specified output directory {tmp_path}. "
3434
"This file is expected to be created as a result of the NCCL test run. "
3535
"Please ensure the NCCL test was executed properly and that stdout.txt is generated. "
36-
f"You can run the generated NCCL test command manually and verify the creation of "
37-
f"{tmp_path / 'stdout.txt'}."
36+
"You can run the generated NCCL test command manually and verify the creation of "
37+
f"{tmp_path / 'stdout.txt'}. If the issue persists, contact the system administrator."
3838
)
3939

4040
def test_successful_job(self, tmp_path: Path) -> None:
@@ -66,8 +66,50 @@ def test_failed_job(self, tmp_path: Path) -> None:
6666
assert result.error_message == (
6767
f"Missing success indicators in {stdout_file}: '# Out of bounds values', '# Avg bus bandwidth'. "
6868
"These keywords are expected to be present in stdout.txt, usually towards the end of the file. "
69-
f"Please ensure the NCCL test ran to completion. You can run the generated sbatch script manually "
70-
f"and check if {stdout_file} is created and contains the expected keywords."
69+
f"Please review the NCCL test output and errors in the file. "
70+
"Ensure the NCCL test ran to completion. You can run the generated sbatch script manually "
71+
f"and check if {stdout_file} is created and contains the expected keywords. "
72+
"If the issue persists, contact the system administrator."
73+
)
74+
75+
def test_nccl_failure_job(self, tmp_path: Path) -> None:
76+
"""Test that job status is False when stdout.txt contains NCCL failure indicators."""
77+
stdout_file = tmp_path / "stdout.txt"
78+
stdout_content = """
79+
# Some initialization output
80+
node: Test NCCL failure common.cu:303 'remote process exited or there was a network error / '
81+
.. node pid: Test failure common.cu:401
82+
.. node pid: Test failure common.cu:588
83+
.. node pid: Test failure alltoall.cu:97
84+
.. node pid: Test failure common.cu:615
85+
.. node pid: Test failure common.cu:1019
86+
.. node pid: Test failure common.cu:844
87+
"""
88+
stdout_file.write_text(stdout_content)
89+
result = self.js.get_job_status(str(tmp_path))
90+
assert not result.is_successful
91+
assert result.error_message == (
92+
f"NCCL test failure detected in {stdout_file}. "
93+
"Possible reasons include network errors or remote process exits. "
94+
"Please review the NCCL test output and errors in the file first. "
95+
"If the issue persists, contact the system administrator."
96+
)
97+
98+
def test_generic_test_failure_job(self, tmp_path: Path) -> None:
99+
"""Test that job status is False when stdout.txt contains generic test failure indicators."""
100+
stdout_file = tmp_path / "stdout.txt"
101+
stdout_content = """
102+
# Some initialization output
103+
.. node pid: Test failure common.cu:401
104+
"""
105+
stdout_file.write_text(stdout_content)
106+
result = self.js.get_job_status(str(tmp_path))
107+
assert not result.is_successful
108+
assert result.error_message == (
109+
f"Test failure detected in {stdout_file}. "
110+
"Please review the specific test failure messages in the file. "
111+
"Ensure that the NCCL test environment is correctly set up and configured. "
112+
"If the issue persists, contact the system administrator."
71113
)
72114

73115

0 commit comments

Comments
 (0)