Merge pull request #41 from TaekyungHeo/not-enough-nodes

srinivas212 · web-flow · commit 503e2d447b1e · 2024-05-28T19:53:35.000-07:00
Enhance job submission error handling with custom exceptions
diff --git a/src/cloudai/_core/base_runner.py b/src/cloudai/_core/base_runner.py
@@ -24,6 +24,7 @@
 from typing import Dict, List, Optional
 
 from .base_job import BaseJob
+from .exceptions import JobSubmissionError
 from .system import System
 from .test import Test
 from .test_scenario import TestScenario
@@ -167,14 +168,14 @@ async def submit_test(self, test: Test):
             test (Test): The test to be started.
         """
         self.logger.info(f"Starting test: {test.section_name}")
-        job = self._submit_test(test)
-        if job:
+        try:
+            job = self._submit_test(test)
             self.jobs.append(job)
             self.test_to_job_map[test] = job
-        else:
-            msg = f"Failed to run test {test.section_name}"
-            self.logger.error(msg)
-            raise RuntimeError(msg)
+        except JobSubmissionError as e:
+            self.logger.error(e)
+            print(e, file=sys.stdout)
+            sys.exit(1)
 
     async def delayed_submit_test(self, test: Test, delay: int):
         """
@@ -189,17 +190,17 @@ async def delayed_submit_test(self, test: Test, delay: int):
         await self.submit_test(test)
 
     @abstractmethod
-    def _submit_test(self, test: Test) -> Optional[BaseJob]:
+    def _submit_test(self, test: Test) -> BaseJob:
         """
         Execute a given test and returns a job if successful.
 
         Args:
             test (Test): The test to be executed.
 
         Returns:
-            Optional[BaseJob]: A BaseJob object if the test execution is
-                              successful, None otherwise.
+            BaseJob: The job created for the test; failures raise JobSubmissionError.
         """
+        raise NotImplementedError("Subclasses of BaseRunner must implement _submit_test.")
 
     async def check_start_post_init_dependencies(self):
         """
diff --git a/src/cloudai/_core/exceptions.py b/src/cloudai/_core/exceptions.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class JobSubmissionError(Exception):
+    """
+    Exception raised for errors that occur during job submission.
+
+    Attributes
+        test_name (str): The name of the test associated with the job.
+        command (str): The command that was executed to submit the job.
+        stdout (str): The stripped standard output from the command execution.
+        stderr (str): The stripped standard error from the command execution.
+        message (str): A custom message describing the error.
+    """
+
+    def __init__(self, test_name: str, command: str, stdout: str, stderr: str, message: str):
+        """
+        Initialize a JobSubmissionError instance.
+
+        Args:
+            test_name (str): The name of the test associated with the job.
+            command (str): The command that was executed to submit the job.
+            stdout (str): The standard output from the command execution; None is stored as "".
+            stderr (str): The standard error from the command execution; None is stored as "".
+            message (str): A custom message describing the error.
+        """
+        super().__init__(message)
+        self.test_name = test_name
+        self.command = command
+        self.stdout = (stdout or "").strip()  # communicate() yields None for a stream that was not piped
+        self.stderr = (stderr or "").strip()
+        self.message = message
+
+    def __str__(self) -> str:
+        """
+        Return a formatted string representation of the JobSubmissionError instance.
+
+        Returns
+            str: A formatted string with detailed error information.
+        """
+        return (
+            f"\nERROR: Job Submission Failed\n"
+            f"\tTest Name: {self.test_name}\n"
+            f"\tMessage: {self.message}\n"
+            f"\tCommand: '{self.command}'\n"
+            f"\tstdout: '{self.stdout}'\n"
+            f"\tstderr: '{self.stderr}'\n"
+        )
+
+
+class JobIdRetrievalError(JobSubmissionError):
+    """
+    Exception raised when a job ID cannot be retrieved after job submission.
+
+    Runners raise it when the test's get_job_id() returns None for the
+    output of the submit command.
+
+    It adds no attributes of its own; everything comes from JobSubmissionError.
+    """
diff --git a/src/cloudai/runner/slurm/slurm_runner.py b/src/cloudai/runner/slurm/slurm_runner.py
@@ -12,10 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional, cast
+from typing import cast
 
 from cloudai._core.base_job import BaseJob
 from cloudai._core.base_runner import BaseRunner
+from cloudai._core.exceptions import JobIdRetrievalError
 from cloudai._core.system import System
 from cloudai._core.test import Test
 from cloudai._core.test_scenario import TestScenario
@@ -56,28 +57,33 @@ def __init__(self, mode: str, system: System, test_scenario: TestScenario) -> No
         self.slurm_system: SlurmSystem = cast(SlurmSystem, system)
         self.cmd_shell = CommandShell()
 
-    def _submit_test(self, test: Test) -> Optional[SlurmJob]:
+    def _submit_test(self, test: Test) -> SlurmJob:
         """
         Submit a test for execution on Slurm and returns a SlurmJob.
 
         Args:
             test (Test): The test to be executed.
 
         Returns:
-            Optional[SlurmJob]: A SlurmJob object if the test execution is
-                                successful, None otherwise.
+            SlurmJob: The submitted job; its job ID is 0 when mode is not "run".
         """
         self.logger.info(f"Running test: {test.section_name}")
         job_output_path = self.get_job_output_path(test)
         exec_cmd = test.gen_exec_command(job_output_path)
         self.logger.info(f"Executing command for test {test.section_name}: {exec_cmd}")
-        job_id = None
+        job_id = 0  # used as-is when mode is not "run"; no command is executed then
         if self.mode == "run":
             stdout, stderr = self.cmd_shell.execute(exec_cmd).communicate()
             job_id = test.get_job_id(stdout, stderr)
-        else:
-            job_id = 0
-        return SlurmJob(job_id, test) if job_id is not None else None
+            if job_id is None:  # get_job_id() returned no ID, so report the command and its output
+                raise JobIdRetrievalError(
+                    test_name=str(test.section_name),
+                    command=exec_cmd,
+                    stdout=stdout,
+                    stderr=stderr,
+                    message="Failed to retrieve job ID from command output.",
+                )
+        return SlurmJob(job_id, test)
 
     def is_job_running(self, job: BaseJob) -> bool:
         """
diff --git a/src/cloudai/runner/standalone/standalone_runner.py b/src/cloudai/runner/standalone/standalone_runner.py
@@ -12,10 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional, cast
+from typing import cast
 
 from cloudai._core.base_job import BaseJob
 from cloudai._core.base_runner import BaseRunner
+from cloudai._core.exceptions import JobIdRetrievalError
 from cloudai._core.system import System
 from cloudai._core.test import Test
 from cloudai._core.test_scenario import TestScenario
@@ -55,28 +56,33 @@ def __init__(
         super().__init__(mode, system, test_scenario)
         self.cmd_shell = CommandShell()
 
-    def _submit_test(self, test: Test) -> Optional[StandaloneJob]:
+    def _submit_test(self, test: Test) -> StandaloneJob:
         """
         Submit a test for execution on Standalone and returns a StandaloneJob.
 
         Args:
             test (Test): The test to be executed.
 
         Returns:
-            Optional[StandaloneJob]: A StandaloneJob object if the test execution is
-                                     successful, None otherwise.
+            StandaloneJob: The started job; its job ID is 0 when mode is not "run".
         """
         self.logger.info(f"Running test: {test.section_name}")
         job_output_path = self.get_job_output_path(test)
         exec_cmd = test.gen_exec_command(job_output_path)
         self.logger.info(f"Executing command for test {test.section_name}: {exec_cmd}")
-        job_id = None
+        job_id = 0  # used as-is when mode is not "run"; no command is executed then
         if self.mode == "run":
             pid = self.cmd_shell.execute(exec_cmd).pid
             job_id = test.get_job_id(str(pid), "")
-        else:
-            job_id = 0
-        return StandaloneJob(job_id, test) if job_id is not None else None
+            if job_id is None:  # get_job_id() returned no ID for the PID string
+                raise JobIdRetrievalError(
+                    test_name=str(test.section_name),
+                    command=exec_cmd,
+                    stdout="",  # the process output is not read here; only its PID is used
+                    stderr="",
+                    message="Failed to retrieve job ID from command output.",
+                )
+        return StandaloneJob(job_id, test)
 
     def is_job_running(self, job: BaseJob) -> bool:
         """
diff --git a/tests/test_job_submission_error.py b/tests/test_job_submission_error.py
@@ -0,0 +1,75 @@
+import subprocess
+from unittest.mock import MagicMock, Mock
+
+import pytest
+from cloudai._core.exceptions import JobIdRetrievalError
+from cloudai._core.test import Test
+from cloudai._core.test_scenario import TestScenario
+from cloudai._core.test_template import TestTemplate
+from cloudai.runner.slurm.slurm_runner import SlurmRunner
+from cloudai.schema.system import SlurmSystem
+from cloudai.schema.system.slurm import SlurmNode, SlurmNodeState
+from cloudai.util import CommandShell
+
+
+class MockCommandShell(CommandShell):
+    """CommandShell stub whose processes report a failed sbatch submission on stderr."""
+
+    def execute(self, command):
+        failure = "sbatch: error: Batch job submission failed: Requested node configuration is not available"
+        process = Mock(spec=subprocess.Popen)
+        process.communicate.return_value = ("", failure)
+        return process
+
+
+class MockTest(Test):
+    """Test stub: fixed sbatch command, and get_job_id() never finds a job ID."""
+
+    def __init__(self, section_name):
+        self.name = "Mock Test"
+        self.description = "A mock test description"
+        self.test_template = MagicMock(spec=TestTemplate)
+        self.env_vars, self.cmd_args, self.extra_env_vars = {}, {}, {}
+        self.extra_cmd_args = ""
+        self.section_name = section_name  # honour the caller-supplied section name
+        self.current_iteration = 0
+
+    def gen_exec_command(self, output_path):
+        return "sbatch mock_script.sh"
+
+    def get_job_id(self, stdout, stderr):
+        return None
+
+
+@pytest.fixture
+def slurm_system(tmpdir):
+    """Build a two-node SlurmSystem whose install and output paths are both tmpdir."""
+    main_partition = [
+        SlurmNode(name=node_name, partition="main", state=SlurmNodeState.UNKNOWN_STATE)
+        for node_name in ("nodeA001", "nodeB001")
+    ]
+    return SlurmSystem(
+        name="test_system",
+        install_path=tmpdir,
+        output_path=tmpdir,
+        default_partition="main",
+        partitions={"main": main_partition},
+    )
+
+
+@pytest.fixture
+def slurm_runner(slurm_system):
+    scenario = TestScenario(name="Test Scenario", tests=[MockTest(section_name="Mock Test")])
+    failing_runner = SlurmRunner(mode="run", system=slurm_system, test_scenario=scenario)
+    failing_runner.cmd_shell = MockCommandShell()  # every executed command now reports an sbatch error
+    return failing_runner
+
+
+def test_job_id_retrieval_error(slurm_runner):
+    """A submission whose output yields no job ID raises JobIdRetrievalError carrying the stderr."""
+    sbatch_error = "sbatch: error: Batch job submission failed: Requested node configuration is not available"
+    with pytest.raises(JobIdRetrievalError) as excinfo:
+        slurm_runner._submit_test(slurm_runner.test_scenario.tests[0])
+    rendered = str(excinfo.value)
+    assert "Failed to retrieve job ID from command output." in rendered
+    assert sbatch_error in rendered