Merge pull request #301 from NVIDIA/am/upd-jobs

amaslenn · web-flow · commit 192713fc7fec · 2024-11-18T18:00:55.000+01:00
Refactor Job classes
diff --git a/src/cloudai/_core/base_job.py b/src/cloudai/_core/base_job.py
@@ -14,73 +14,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from pathlib import Path
+from dataclasses import dataclass, field
 from typing import Union
 
-from .system import System
 from .test_scenario import TestRun
 
 
+@dataclass
 class BaseJob:
-    """
-    Base class for representing a job created by executing a test.
+    """Base class for representing a job created by executing a test."""
 
-    Attributes
-        id (Union[str, int]): The unique identifier of the job.
-        mode (str): The mode of the job (e.g., 'run', 'dry-run').
-        system (System): The system in which the job is running.
-        test_run (TestRun): The TestRun instance associated with this job.
-        output_path (Path): The path where the job's output is stored.
-        terminated_by_dependency (bool): Flag to indicate if the job was terminated due to a dependency.
-    """
-
-    def __init__(self, mode: str, system: System, test_run: TestRun):
-        """
-        Initialize a BaseJob instance.
-
-        Args:
-            mode (str): The mode of the job (e.g., 'run', 'dry-run').
-            system (System): The system in which the job is running.
-            test_run (TestRun): The TestRun instance associated with this job.
-        """
-        self.id: Union[str, int] = 0
-        self.mode: str = mode
-        self.system: System = system
-        self.test_run: TestRun = test_run
-        self.output_path: Path = test_run.output_path
-        self.terminated_by_dependency: bool = False
-
-    def is_running(self) -> bool:
-        """
-        Check if the specified job is currently running.
-
-        Returns
-            bool: True if the job is running, False otherwise.
-        """
-        if self.mode == "dry-run":
-            return True
-        return self.system.is_job_running(self)
-
-    def is_completed(self) -> bool:
-        """
-        Check if a job is completed.
-
-        Returns
-            bool: True if the job is completed, False otherwise.
-        """
-        if self.mode == "dry-run":
-            return True
-        return self.system.is_job_completed(self)
-
-    def increment_iteration(self):
-        """Increment the iteration count of the associated test."""
-        self.test_run.current_iteration += 1
-
-    def __repr__(self) -> str:
-        """
-        Return a string representation of the BaseJob instance.
-
-        Returns
-            str: String representation of the job.
-        """
-        return f"BaseJob(id={self.id}, mode={self.mode}, system={self.system.name}, test={self.test_run.test.name})"
+    test_run: TestRun
+    id: Union[str, int]
+    terminated_by_dependency: bool = field(default=False, init=False)
diff --git a/src/cloudai/_core/base_runner.py b/src/cloudai/_core/base_runner.py
@@ -204,7 +204,7 @@ async def check_start_post_init_dependencies(self):
         items = list(self.testrun_to_job_map.items())
 
         for tr, job in items:
-            if job.is_running():
+            if self.system.is_job_running(job):
                 await self.check_and_schedule_start_post_init_dependent_tests(tr)
 
     async def check_and_schedule_start_post_init_dependent_tests(self, started_test_run: TestRun):
@@ -279,7 +279,7 @@ async def monitor_jobs(self) -> int:
         successful_jobs_count = 0
 
         for job in list(self.jobs):
-            if job.is_completed():
+            if self.system.is_job_completed(job):
                 await self.job_completion_callback(job)
 
                 if self.mode == "dry-run":
@@ -322,7 +322,7 @@ def get_job_status(self, job: BaseJob) -> JobStatusResult:
         Returns:
             JobStatusResult: The result containing the job status and an optional error message.
         """
-        return job.test_run.test.test_template.get_job_status(job.output_path)
+        return job.test_run.test.test_template.get_job_status(job.test_run.output_path)
 
     async def handle_job_completion(self, completed_job: BaseJob):
         """
@@ -335,7 +335,7 @@ async def handle_job_completion(self, completed_job: BaseJob):
 
         self.jobs.remove(completed_job)
         del self.testrun_to_job_map[completed_job.test_run]
-        completed_job.increment_iteration()
+        completed_job.test_run.current_iteration += 1
         if not completed_job.terminated_by_dependency and completed_job.test_run.has_more_iterations():
             msg = f"Re-running job for iteration {completed_job.test_run.current_iteration}"
             logging.info(msg)
diff --git a/src/cloudai/runner/kubernetes/kubernetes_job.py b/src/cloudai/runner/kubernetes/kubernetes_job.py
@@ -15,42 +15,14 @@
 # limitations under the License.
 
 
-from cloudai import BaseJob, System, TestRun
+from dataclasses import dataclass
 
+from cloudai import BaseJob
 
-class KubernetesJob(BaseJob):
-    """
-    A job class for execution on a Kubernetes system.
-
-    Attributes
-        mode (str): The mode of the job (e.g., 'run', 'dry-run').
-        system (System): The system in which the job is running.
-        test_run (TestRun): The test instance associated with this job.
-        name (str): The name of the job.
-        kind (str): The kind of the job.
-    """
-
-    def __init__(self, mode: str, system: System, test_run: TestRun, name: str, kind: str):
-        """
-        Initialize a KubernetesJob instance.
 
-        Args:
-            mode (str): The mode of the job (e.g., 'run', 'dry-run').
-            system (System): The system in which the job is running.
-            test_run (TestRun): The test instance associated with this job.
-            name (str): The name of the job.
-            kind (str): The kind of the job.
-        """
-        super().__init__(mode, system, test_run)
-        self.id = name
-        self.name = name
-        self.kind = kind
-
-    def __repr__(self) -> str:
-        """
-        Return a string representation of the KubernetesJob instance.
+@dataclass
+class KubernetesJob(BaseJob):
+    """A job class for execution on a Kubernetes system."""
 
-        Returns
-            str: String representation of the job.
-        """
-        return f"KubernetesJob(name={self.name}, test={self.test_run.test.name}, " f"kind={self.kind})"
+    kind: str
+    name: str
diff --git a/src/cloudai/runner/kubernetes/kubernetes_runner.py b/src/cloudai/runner/kubernetes/kubernetes_runner.py
@@ -47,7 +47,7 @@ def _submit_test(self, tr: TestRun) -> KubernetesJob:
             k8s_system: KubernetesSystem = cast(KubernetesSystem, self.system)
             job_name = k8s_system.create_job(job_spec)
 
-        return KubernetesJob(self.mode, self.system, tr, job_name, job_kind)
+        return KubernetesJob(tr, id=job_name, name=job_name, kind=job_kind)
 
     async def job_completion_callback(self, job: BaseJob) -> None:
         """
@@ -58,7 +58,7 @@ async def job_completion_callback(self, job: BaseJob) -> None:
         """
         k8s_system: KubernetesSystem = cast(KubernetesSystem, self.system)
         k_job = cast(KubernetesJob, job)
-        k8s_system.store_logs_for_job(k_job.name, k_job.output_path)
+        k8s_system.store_logs_for_job(k_job.name, k_job.test_run.output_path)
         k8s_system.delete_job(k_job.name, k_job.kind)
 
     def kill_job(self, job: BaseJob) -> None:
@@ -70,5 +70,5 @@ def kill_job(self, job: BaseJob) -> None:
         """
         k8s_system: KubernetesSystem = cast(KubernetesSystem, self.system)
         k_job = cast(KubernetesJob, job)
-        k8s_system.store_logs_for_job(k_job.name, k_job.output_path)
+        k8s_system.store_logs_for_job(k_job.name, k_job.test_run.output_path)
         k8s_system.delete_job(k_job.name, k_job.kind)
diff --git a/src/cloudai/runner/slurm/slurm_job.py b/src/cloudai/runner/slurm/slurm_job.py
@@ -14,28 +14,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Union
+from dataclasses import dataclass
 
-from cloudai import BaseJob, System, TestRun
+from cloudai import BaseJob
 
 
+@dataclass
 class SlurmJob(BaseJob):
-    """
-    A job class for execution on a Slurm system.
+    """A job class for execution on a Slurm system."""
 
-    Attributes
-        id (Union[str, int]): The unique identifier of the job.
-    """
-
-    def __init__(self, mode: str, system: System, test_run: TestRun, job_id: Union[str, int]):
-        BaseJob.__init__(self, mode, system, test_run)
-        self.id = job_id
-
-    def __repr__(self) -> str:
-        """
-        Return a string representation of the SlurmJob instance.
-
-        Returns
-            str: String representation of the job.
-        """
-        return f"SlurmJob(id={self.id}, test={self.test_run.test.name})"
+    pass
diff --git a/src/cloudai/runner/slurm/slurm_runner.py b/src/cloudai/runner/slurm/slurm_runner.py
@@ -68,5 +68,5 @@ def _submit_test(self, tr: TestRun) -> SlurmJob:
                     stderr=stderr,
                     message="Failed to retrieve job ID from command output.",
                 )
-            logging.info(f"Submitted slurm job: {job_id}")
-        return SlurmJob(self.mode, self.system, tr, job_id)
+        logging.info(f"Submitted slurm job: {job_id}")
+        return SlurmJob(tr, id=job_id)
diff --git a/src/cloudai/runner/standalone/standalone_job.py b/src/cloudai/runner/standalone/standalone_job.py
@@ -14,28 +14,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Union
+from dataclasses import dataclass
 
-from cloudai import BaseJob, System, TestRun
+from cloudai import BaseJob
 
 
+@dataclass
 class StandaloneJob(BaseJob):
-    """
-    A job class for standalone execution.
+    """A job class for standalone execution."""
 
-    Attributes
-        id (Union[str, int]): The unique identifier of the job.
-    """
-
-    def __init__(self, mode: str, system: System, test_run: TestRun, job_id: Union[str, int]):
-        BaseJob.__init__(self, mode, system, test_run)
-        self.id = job_id
-
-    def __repr__(self) -> str:
-        """
-        Return a string representation of the StandaloneJob instance.
-
-        Returns
-            str: String representation of the job.
-        """
-        return f"StandaloneJob(id={self.id}, test={self.test_run.test.name})"
+    pass
diff --git a/src/cloudai/runner/standalone/standalone_runner.py b/src/cloudai/runner/standalone/standalone_runner.py
@@ -68,4 +68,4 @@ def _submit_test(self, tr: TestRun) -> StandaloneJob:
                     stderr="",
                     message="Failed to retrieve job ID from command output.",
                 )
-        return StandaloneJob(self.mode, self.system, tr, job_id)
+        return StandaloneJob(tr, id=job_id)
diff --git a/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py
@@ -69,7 +69,7 @@ def gen_exec_command(self, tr: TestRun) -> str:
         self.final_cmd_args.update(
             {
                 "base_results_dir": str(tr.output_path.absolute()),
-                "launcher_scripts_path": str((repo_path / tdef.cmd_args.launcher_script).parent),
+                "launcher_scripts_path": str((repo_path / tdef.cmd_args.launcher_script).parent.absolute()),
             }
         )
 
diff --git a/tests/slurm_command_gen_strategy/test_nemo_launcher_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_nemo_launcher_slurm_command_gen_strategy.py
@@ -212,6 +212,10 @@ def test_log_command_to_file(
         test_run.output_path = tmp_path / "output_dir"
         test_run.output_path.mkdir()
 
+        repo_path = (tmp_path / "repo").relative_to(tmp_path)
+        tdef: NeMoLauncherTestDefinition = cast(NeMoLauncherTestDefinition, test_run.test.test_definition)
+        tdef.python_executable.git_repo.installed_path = repo_path
+        tdef.python_executable.venv_path = repo_path.parent / f"{repo_path.name}-venv"
         cmd_gen_strategy.gen_exec_command(test_run)
 
         written_content = mock_file().write.call_args[0][0]
@@ -221,6 +225,11 @@ def test_log_command_to_file(
         assert "TEST_VAR_1=value1" in written_content, "Logged command should contain environment variables"
         assert "training.trainer.num_nodes=2" in written_content, "Command should contain the number of nodes"
 
+        assert str((tdef.python_executable.venv_path / "bin" / "python").absolute()) in written_content
+        assert (
+            f"launcher_scripts_path={(repo_path / tdef.cmd_args.launcher_script).parent.absolute()} " in written_content
+        )
+
     def test_no_line_breaks_in_executed_command(
         self, cmd_gen_strategy: NeMoLauncherSlurmCommandGenStrategy, test_run: TestRun, tmp_path: Path
     ) -> None:
diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py
@@ -70,7 +70,11 @@ def test_slurm(tmp_path: Path, scenario: Dict):
         test_scenario=test_scenario_path,
         output_dir=tmp_path,
     )
-    with patch("asyncio.sleep", return_value=None):
+    with (
+        patch("asyncio.sleep", return_value=None),
+        patch("cloudai.systems.slurm.SlurmSystem.is_job_completed", return_value=True),
+        patch("cloudai.systems.slurm.SlurmSystem.is_job_running", return_value=True),
+    ):
         handle_dry_run_and_run(args)
 
     # Find the directory that was created for the test results
diff --git a/tests/test_standalone_system.py b/tests/test_standalone_system.py
@@ -62,7 +62,7 @@ def standalone_job(standalone_system, mock_test):
     Returns:
         StandaloneJob: A new instance of StandaloneJob for testing.
     """
-    return StandaloneJob("run", standalone_system, mock_test, 12345)
+    return StandaloneJob(mock_test, id=12345)
 
 
 @pytest.mark.parametrize(

Original file line number	Diff line number	Diff line change
`@@ -68,5 +68,5 @@ def _submit_test(self, tr: TestRun) -> SlurmJob:`
`68`	`68`	`stderr=stderr,`
`69`	`69`	`message="Failed to retrieve job ID from command output.",`
`70`	`70`	`)`
`71`		`- logging.info(f"Submitted slurm job: {job_id}")`
`72`		`- return SlurmJob(self.mode, self.system, tr, job_id)`
	`71`	`+ logging.info(f"Submitted slurm job: {job_id}")`
	`72`	`+ return SlurmJob(tr, id=job_id)`
Original file line number	Diff line number	Diff line change
`@@ -68,4 +68,4 @@ def _submit_test(self, tr: TestRun) -> StandaloneJob:`
`68`	`68`	`stderr="",`
`69`	`69`	`message="Failed to retrieve job ID from command output.",`
`70`	`70`	`)`
`71`		`- return StandaloneJob(self.mode, self.system, tr, job_id)`
	`71`	`+ return StandaloneJob(tr, id=job_id)`
Original file line number	Diff line number	Diff line change
`@@ -69,7 +69,7 @@ def gen_exec_command(self, tr: TestRun) -> str:`
`69`	`69`	`self.final_cmd_args.update(`
`70`	`70`	`{`
`71`	`71`	`"base_results_dir": str(tr.output_path.absolute()),`
`72`		`- "launcher_scripts_path": str((repo_path / tdef.cmd_args.launcher_script).parent),`
	`72`	`+ "launcher_scripts_path": str((repo_path / tdef.cmd_args.launcher_script).parent.absolute()),`
`73`	`73`	`}`
`74`	`74`	`)`
`75`	`75`