@@ -33,8 +33,8 @@ def test_no_stdout_file(self, tmp_path: Path) -> None:
3333 f"stdout.txt file not found in the specified output directory { tmp_path } . "
3434 "This file is expected to be created as a result of the NCCL test run. "
3535 "Please ensure the NCCL test was executed properly and that stdout.txt is generated. "
36- f "You can run the generated NCCL test command manually and verify the creation of "
37- f"{ tmp_path / 'stdout.txt' } ."
36+ "You can run the generated NCCL test command manually and verify the creation of "
37+ f"{ tmp_path / 'stdout.txt' } . If the issue persists, contact the system administrator. "
3838 )
3939
4040 def test_successful_job (self , tmp_path : Path ) -> None :
@@ -66,8 +66,50 @@ def test_failed_job(self, tmp_path: Path) -> None:
6666 assert result .error_message == (
6767 f"Missing success indicators in { stdout_file } : '# Out of bounds values', '# Avg bus bandwidth'. "
6868 "These keywords are expected to be present in stdout.txt, usually towards the end of the file. "
69- f"Please ensure the NCCL test ran to completion. You can run the generated sbatch script manually "
70- f"and check if { stdout_file } is created and contains the expected keywords."
69+ f"Please review the NCCL test output and errors in the file. "
70+ "Ensure the NCCL test ran to completion. You can run the generated sbatch script manually "
71+ f"and check if { stdout_file } is created and contains the expected keywords. "
72+ "If the issue persists, contact the system administrator."
73+ )
74+
75+ def test_nccl_failure_job (self , tmp_path : Path ) -> None :
76+ """Test that job status is False when stdout.txt contains NCCL failure indicators. Writes a stdout.txt holding a 'Test NCCL failure' line followed by several 'Test failure' lines, then checks that get_job_status reports an unsuccessful result with the NCCL-specific error message."""
77+ stdout_file = tmp_path / "stdout.txt"
78+ stdout_content = """
79+ # Some initialization output
80+ node: Test NCCL failure common.cu:303 'remote process exited or there was a network error / '
81+ .. node pid: Test failure common.cu:401
82+ .. node pid: Test failure common.cu:588
83+ .. node pid: Test failure alltoall.cu:97
84+ .. node pid: Test failure common.cu:615
85+ .. node pid: Test failure common.cu:1019
86+ .. node pid: Test failure common.cu:844
87+ """
88+ stdout_file .write_text (stdout_content )  # create stdout.txt inside the temporary output directory
89+ result = self .js .get_job_status (str (tmp_path ))  # the directory (as str) is passed, not the stdout.txt path
90+ assert not result .is_successful
91+ assert result .error_message == (  # compared verbatim: any wording change in the message must be mirrored here
92+ f"NCCL test failure detected in { stdout_file } . "
93+ "Possible reasons include network errors or remote process exits. "
94+ "Please review the NCCL test output and errors in the file first. "
95+ "If the issue persists, contact the system administrator."
96+ )
97+
98+ def test_generic_test_failure_job (self , tmp_path : Path ) -> None :
99+ """Test that job status is False when stdout.txt contains generic test failure indicators. Writes a stdout.txt whose only failure line is a 'Test failure' entry (no 'Test NCCL failure' line), then checks that get_job_status reports an unsuccessful result with the generic test-failure error message."""
100+ stdout_file = tmp_path / "stdout.txt"
101+ stdout_content = """
102+ # Some initialization output
103+ .. node pid: Test failure common.cu:401
104+ """
105+ stdout_file .write_text (stdout_content )  # create stdout.txt inside the temporary output directory
106+ result = self .js .get_job_status (str (tmp_path ))  # the directory (as str) is passed, not the stdout.txt path
107+ assert not result .is_successful
108+ assert result .error_message == (  # compared verbatim: any wording change in the message must be mirrored here
109+ f"Test failure detected in { stdout_file } . "
110+ "Please review the specific test failure messages in the file. "
111+ "Ensure that the NCCL test environment is correctly set up and configured. "
112+ "If the issue persists, contact the system administrator."
71113 )
72114
73115
0 commit comments