
Commit 5b74667

Merge pull request #101 from TaekyungHeo/grok-job-status
Update JaxToolboxJobStatusRetrievalStrategy to improve error messages
2 parents 883a530 + cf374f5 commit 5b74667

2 files changed: +182 -41 lines changed

src/cloudai/schema/test_template/jax_toolbox/job_status_retrieval_strategy.py

Lines changed: 95 additions & 28 deletions
@@ -32,28 +32,50 @@ def get_job_status(self, output_path: str) -> JobStatusResult:
             JobStatusResult: The result containing the job status and an optional error message.
         """
         profile_stderr_path = os.path.join(output_path, "profile_stderr.txt")
+
+        result = self.check_profile_stderr(profile_stderr_path, output_path)
+        if not result.is_successful:
+            return result
+
+        error_files = list(Path(output_path).glob("error-*.txt"))
+        if not error_files:
+            return JobStatusResult(
+                is_successful=False,
+                error_message=(
+                    f"No 'error-*.txt' files found in the output directory, {output_path}. There are two stages in the "
+                    "Grok run. The profiling stage passed successfully, but something went wrong in the actual run "
+                    "stage. Please ensure the actual run stage completed successfully. "
+                    "Run the generated sbatch script manually to debug."
+                ),
+            )
+
+        return self.check_error_files(error_files, output_path)
+
+    def check_profile_stderr(self, profile_stderr_path: str, output_path: str) -> JobStatusResult:
+        """
+        Check the profile_stderr.txt file for known error messages.
+
+        Args:
+            profile_stderr_path (str): Path to the 'profile_stderr.txt' file.
+            output_path (str): Path to the output directory.
+
+        Returns:
+            JobStatusResult: The result containing the job status and an optional error message.
+        """
         if not os.path.isfile(profile_stderr_path):
             return JobStatusResult(
                 is_successful=False,
                 error_message=(
-                    "profile_stderr.txt file not found in the specified output directory. "
-                    "This file is expected to be created during the profiling stage of the Grok run. "
+                    f"profile_stderr.txt file not found in the specified output directory, {output_path}. "
+                    "This file is expected to be created during the profiling stage. "
                     "Please ensure the profiling stage completed successfully. "
                     "Run the generated sbatch script manually to debug."
                 ),
             )
 
         with open(profile_stderr_path, "r") as file:
             content = file.read()
-            if "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected" in content:
-                return JobStatusResult(
-                    is_successful=False,
-                    error_message=(
-                        "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected. This may be due to missing "
-                        "environment variables, specifically but not limited to CUDA_VISIBLE_DEVICES. "
-                        "Please ensure the environment variables are set correctly and try again."
-                    ),
-                )
+
         if "[PAX STATUS]: E2E time: Elapsed time for " not in content:
             return JobStatusResult(
                 is_successful=False,
@@ -65,36 +87,81 @@ def get_job_status(self, output_path: str) -> JobStatusResult:
                 ),
             )
 
-        error_files = list(Path(output_path).glob("error-*.txt"))
-        if not error_files:
+        result = self.check_common_errors(content, profile_stderr_path, output_path)
+        if not result.is_successful:
+            return result
+
+        return JobStatusResult(is_successful=True)
+
+    def check_common_errors(self, content: str, file_path: str, output_path: str) -> JobStatusResult:
+        """
+        Check for common errors in the file content.
+
+        Args:
+            content (str): The content of the file to check.
+            file_path (str): The path of the file being checked.
+            output_path (str): Path to the output directory.
+
+        Returns:
+            JobStatusResult: The result containing the job status and an optional error message.
+        """
+        if "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected" in content:
             return JobStatusResult(
                 is_successful=False,
                 error_message=(
-                    "No 'error-*.txt' files found in the output directory. There are two stages in the Grok run. "
-                    "The profiling stage passed successfully, but something went wrong in the actual run stage. "
-                    "Please ensure the actual run stage completed successfully. "
-                    "Run the generated sbatch script manually to debug."
+                    "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected. This may be due to missing "
+                    "environment variables, specifically but not limited to CUDA_VISIBLE_DEVICES. "
+                    "First, check if GPUs are available on the server. "
+                    "Second, if running the job with Slurm, ensure proper resource-related options are set, "
+                    "including GPU resource requirements. Lastly, check environment variables. "
+                    "If the problem persists, verify commands and environment variables by running a simple GPU-only "
+                    "example command."
+                ),
+            )
+        if "Terminating process because the coordinator detected missing heartbeats" in content:
+            return JobStatusResult(
+                is_successful=False,
+                error_message=(
+                    "Terminating process because the coordinator detected missing heartbeats. This most likely "
+                    f"indicates that another task died. Please review the file at {file_path} and any relevant logs in"
+                    f" {output_path}. Ensure the servers allocated for this task can reach each other with their "
+                    "hostnames, and they can open any ports and reach others' ports."
                 ),
             )
+        if "NCCL operation ncclGroupEnd() failed" in content:
+            return JobStatusResult(
+                is_successful=False,
+                error_message=(
+                    "NCCL operation ncclGroupEnd() failed: unhandled system error. Please check if the NCCL-test "
+                    "passes. Run with NCCL_DEBUG=INFO for more details."
+                ),
+            )
+
+        return JobStatusResult(is_successful=True)
+
+    def check_error_files(self, error_files: list, output_path: str) -> JobStatusResult:
+        """
+        Check the error-*.txt files for known error messages.
+
+        Args:
+            error_files (list): List of paths to error files.
+            output_path (str): Path to the output directory.
 
+        Returns:
+            JobStatusResult: The result containing the job status and an optional error message.
+        """
         for error_file in error_files:
             with open(error_file, "r") as file:
                 content = file.read()
-                if "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected" in content:
-                    return JobStatusResult(
-                        is_successful=False,
-                        error_message=(
-                            "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected. This may be due to missing "
-                            "environment variables, specifically but not limited to CUDA_VISIBLE_DEVICES. "
-                            "Please ensure the environment variables are set correctly and try again."
-                        ),
-                    )
+                result = self.check_common_errors(content, error_file, output_path)
+                if not result.is_successful:
+                    return result
                 if "E2E time: Elapsed time for" not in content:
                     return JobStatusResult(
                         is_successful=False,
                         error_message=(
-                            f"The file {error_file} does not contain the expected 'E2E time: Elapsed time for' keyword "
-                            "at the end. This indicates the actual run did not complete successfully. "
+                            f"The file {error_file} does not contain the expected 'E2E time: Elapsed time for' "
+                            "keyword at the end. This indicates the actual run did not complete successfully. "
                            "Please debug this manually to ensure the actual run stage completes as expected."
                         ),
                     )
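
The refactor splits the old monolithic get_job_status into three helpers: check_profile_stderr validates the profiling stage, check_common_errors scans a file's content for the shared failure signatures (CUDA_ERROR_NO_DEVICE, missing coordinator heartbeats, ncclGroupEnd() failures), and check_error_files applies those same checks plus the E2E-time check to each error-*.txt from the actual run. A minimal usage sketch follows; the import path and the no-argument constructor are assumptions inferred from the file location and the test file, not something this diff shows.

# Minimal usage sketch (assumed import path and constructor; only get_job_status(output_path) appears in this diff).
from cloudai.schema.test_template.jax_toolbox.job_status_retrieval_strategy import (
    JaxToolboxJobStatusRetrievalStrategy,
)

strategy = JaxToolboxJobStatusRetrievalStrategy()
# output_dir is a hypothetical path to a directory containing profile_stderr.txt and error-*.txt files.
result = strategy.get_job_status("/path/to/jax_toolbox/output_dir")
if not result.is_successful:
    print(result.error_message)

Centralizing the signatures in check_common_errors means the profiling stage and the actual-run stage now report the same, more actionable message for the same root cause.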

tests/test_job_status_retrieval_strategy.py

Lines changed: 87 additions & 13 deletions
@@ -125,8 +125,8 @@ def test_no_profile_stderr_file(self, tmp_path: Path) -> None:
         result = self.js.get_job_status(str(tmp_path))
         assert not result.is_successful
         assert result.error_message == (
-            "profile_stderr.txt file not found in the specified output directory. "
-            "This file is expected to be created during the profiling stage of the Grok run. "
+            f"profile_stderr.txt file not found in the specified output directory, {str(tmp_path)}. "
+            "This file is expected to be created during the profiling stage. "
             "Please ensure the profiling stage completed successfully. "
             "Run the generated sbatch script manually to debug."
         )
@@ -139,10 +139,9 @@ def test_missing_pax_status_keyword(self, tmp_path: Path) -> None:
         result = self.js.get_job_status(str(tmp_path))
         assert not result.is_successful
         assert result.error_message == (
-            "The profiling stage completed but did not generate the expected "
-            "'[PAX STATUS]: E2E time: Elapsed time for ' "
-            "keyword. There are two stages in the Grok run, and an error occurred in the profiling stage. "
-            "While profile_stderr.txt was created, the expected keyword is missing. "
+            "The profiling stage completed but did not generate the expected '[PAX STATUS]: E2E time: "
+            "Elapsed time for ' keyword. There are two stages in the Grok run, and an error occurred in "
+            "the profiling stage. While profile_stderr.txt was created, the expected keyword is missing. "
             "You need to run the sbatch script manually to see what happens."
         )
 
@@ -154,24 +153,29 @@ def test_no_error_files(self, tmp_path: Path) -> None:
         result = self.js.get_job_status(str(tmp_path))
         assert not result.is_successful
         assert result.error_message == (
-            "No 'error-*.txt' files found in the output directory. There are two stages in the Grok run. "
-            "The profiling stage passed successfully, but something went wrong in the actual run stage. "
+            f"No 'error-*.txt' files found in the output directory, {str(tmp_path)}. There are two stages in the Grok "
+            "run. The profiling stage passed successfully, but something went wrong in the actual run stage. "
             "Please ensure the actual run stage completed successfully. "
             "Run the generated sbatch script manually to debug."
         )
 
     def test_cuda_no_device_error_in_profile_stderr(self, tmp_path: Path) -> None:
         """Test that job status is False when profile_stderr.txt contains CUDA_ERROR_NO_DEVICE."""
         profile_stderr_file = tmp_path / "profile_stderr.txt"
-        profile_stderr_content = "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected"
+        profile_stderr_content = "[PAX STATUS]: E2E time: Elapsed time for profiling\n"
+        profile_stderr_content += "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected"
         profile_stderr_file.write_text(profile_stderr_content)
 
         result = self.js.get_job_status(str(tmp_path))
         assert not result.is_successful
         assert result.error_message == (
             "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected. This may be due to missing "
             "environment variables, specifically but not limited to CUDA_VISIBLE_DEVICES. "
-            "Please ensure the environment variables are set correctly and try again."
+            "First, check if GPUs are available on the server. "
+            "Second, if running the job with Slurm, ensure proper resource-related options are set, "
+            "including GPU resource requirements. Lastly, check environment variables. "
+            "If the problem persists, verify commands and environment variables by running a simple GPU-only "
+            "example command."
         )
 
     def test_missing_e2e_time_keyword(self, tmp_path: Path) -> None:
@@ -187,8 +191,8 @@ def test_missing_e2e_time_keyword(self, tmp_path: Path) -> None:
         result = self.js.get_job_status(str(tmp_path))
         assert not result.is_successful
         assert result.error_message == (
-            f"The file {error_file} does not contain the expected 'E2E time: Elapsed time for' keyword at the end. "
-            "This indicates the actual run did not complete successfully. "
+            f"The file {str(error_file)} does not contain the expected 'E2E time: Elapsed time for' keyword at the "
+            "end. This indicates the actual run did not complete successfully. "
             "Please debug this manually to ensure the actual run stage completes as expected."
         )
 
@@ -207,7 +211,11 @@ def test_cuda_no_device_error_in_error_file(self, tmp_path: Path) -> None:
         assert result.error_message == (
             "CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected. This may be due to missing "
             "environment variables, specifically but not limited to CUDA_VISIBLE_DEVICES. "
-            "Please ensure the environment variables are set correctly and try again."
+            "First, check if GPUs are available on the server. "
+            "Second, if running the job with Slurm, ensure proper resource-related options are set, "
+            "including GPU resource requirements. Lastly, check environment variables. "
+            "If the problem persists, verify commands and environment variables by running a simple GPU-only "
+            "example command."
         )
 
     def test_successful_job(self, tmp_path: Path) -> None:
@@ -223,3 +231,69 @@ def test_successful_job(self, tmp_path: Path) -> None:
         result = self.js.get_job_status(str(tmp_path))
         assert result.is_successful
         assert result.error_message == ""
+
+    def test_nccl_group_end_error_in_profile_stderr(self, tmp_path: Path) -> None:
+        """Test that job status is False when profile_stderr.txt contains NCCL operation ncclGroupEnd() failed."""
+        profile_stderr_file = tmp_path / "profile_stderr.txt"
+        profile_stderr_content = "[PAX STATUS]: E2E time: Elapsed time for profiling\n"
+        profile_stderr_content += "NCCL operation ncclGroupEnd() failed: unhandled system error"
+        profile_stderr_file.write_text(profile_stderr_content)
+
+        result = self.js.get_job_status(str(tmp_path))
+        assert not result.is_successful
+        assert result.error_message == (
+            "NCCL operation ncclGroupEnd() failed: unhandled system error. Please check if the NCCL-test "
+            "passes. Run with NCCL_DEBUG=INFO for more details."
+        )
+
+    def test_nccl_group_end_error_in_error_file(self, tmp_path: Path) -> None:
+        """Test that job status is False when error-*.txt contains NCCL operation ncclGroupEnd() failed."""
+        profile_stderr_file = tmp_path / "profile_stderr.txt"
+        profile_stderr_content = "[PAX STATUS]: E2E time: Elapsed time for profiling"
+        profile_stderr_file.write_text(profile_stderr_content)
+
+        error_file = tmp_path / "error-1.txt"
+        error_content = "NCCL operation ncclGroupEnd() failed: unhandled system error"
+        error_file.write_text(error_content)
+
+        result = self.js.get_job_status(str(tmp_path))
+        assert not result.is_successful
+        assert result.error_message == (
+            "NCCL operation ncclGroupEnd() failed: unhandled system error. Please check if the NCCL-test "
+            "passes. Run with NCCL_DEBUG=INFO for more details."
+        )
+
+    def test_heartbeat_error_in_profile_stderr(self, tmp_path: Path) -> None:
+        """Test that job status is False when profile_stderr.txt contains coordinator detected missing heartbeats."""
+        profile_stderr_file = tmp_path / "profile_stderr.txt"
+        profile_stderr_content = "[PAX STATUS]: E2E time: Elapsed time for profiling\n"
+        profile_stderr_content += "Terminating process because the coordinator detected missing heartbeats"
+        profile_stderr_file.write_text(profile_stderr_content)
+
+        result = self.js.get_job_status(str(tmp_path))
+        assert not result.is_successful
+        assert result.error_message == (
+            "Terminating process because the coordinator detected missing heartbeats. This most likely "
+            f"indicates that another task died. Please review the file at {str(profile_stderr_file)} and any relevant "
+            f"logs in {str(tmp_path)}. Ensure the servers allocated for this task can reach each other with their "
+            "hostnames, and they can open any ports and reach others' ports."
+        )
+
+    def test_heartbeat_error_in_error_file(self, tmp_path: Path) -> None:
+        """Test that job status is False when error-*.txt contains coordinator detected missing heartbeats."""
+        profile_stderr_file = tmp_path / "profile_stderr.txt"
+        profile_stderr_content = "[PAX STATUS]: E2E time: Elapsed time for profiling"
+        profile_stderr_file.write_text(profile_stderr_content)
+
+        error_file = tmp_path / "error-1.txt"
+        error_content = "Terminating process because the coordinator detected missing heartbeats"
+        error_file.write_text(error_content)
+
+        result = self.js.get_job_status(str(tmp_path))
+        assert not result.is_successful
+        assert result.error_message == (
+            "Terminating process because the coordinator detected missing heartbeats. This most likely "
+            f"indicates that another task died. Please review the file at {str(error_file)} and any relevant logs in"
+            f" {str(tmp_path)}. Ensure the servers allocated for this task can reach each other with their "
+            "hostnames, and they can open any ports and reach others' ports."
        )