Merge pull request #725 from NVIDIA/am/bug-4744288

amaslenn · web-flow · commit d3763fe30224 · 2025-11-24T14:19:26.000+01:00
Fixed and issue when using dependencies could result in an infinite loop
diff --git a/src/cloudai/_core/base_runner.py b/src/cloudai/_core/base_runner.py
@@ -348,15 +348,15 @@ async def handle_dependencies(self, completed_job: BaseJob) -> List[Task]:
         for tr in self.test_scenario.test_runs:
             if tr not in self.testrun_to_job_map:
                 for dep_type, dep in tr.dependencies.items():
-                    if dep_type == "start_post_comp" and dep.test_run.test == completed_job.test_run.test:
+                    if dep_type == "start_post_comp" and dep.test_run == completed_job.test_run:
                         task = await self.delayed_submit_test(tr)
                         if task:
                             tasks.append(task)
 
         # Handling end_post_comp dependencies
         for test, dependent_job in self.testrun_to_job_map.items():
             for dep_type, dep in test.dependencies.items():
-                if dep_type == "end_post_comp" and dep.test_run.test == completed_job.test_run.test:
+                if dep_type == "end_post_comp" and dep.test_run == completed_job.test_run:
                     task = await self.delayed_kill_job(dependent_job)
                     tasks.append(task)
 
diff --git a/tests/test_base_runner.py b/tests/test_base_runner.py
@@ -14,14 +14,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import asyncio
+from copy import deepcopy
 from pathlib import Path
 from typing import cast
 
 import pytest
 from pydantic import ConfigDict
 
-from cloudai._core.system import System
-from cloudai.core import BaseJob, BaseRunner, JobStatusResult, TestDefinition, TestRun, TestScenario
+from cloudai.core import (
+    BaseJob,
+    BaseRunner,
+    JobStatusResult,
+    System,
+    TestDefinition,
+    TestDependency,
+    TestRun,
+    TestScenario,
+)
 from cloudai.models.workload import CmdArgs
 from cloudai.systems.slurm import SlurmSystem
 
@@ -30,13 +40,23 @@ class MyRunner(BaseRunner):
     def __init__(self, mode: str, system: System, test_scenario: TestScenario, output_path: Path):
         super().__init__(mode, system, test_scenario, output_path)
         self.runner_job_status_result = JobStatusResult(is_successful=True)
+        self.submitted_trs: list[TestRun] = []
+        self.killed_by_dependency: list[BaseJob] = []
 
     def get_runner_job_status(self, job: BaseJob) -> JobStatusResult:
         return self.runner_job_status_result
 
     def _submit_test(self, tr: TestRun) -> BaseJob:
+        self.submitted_trs.append(tr)
         return BaseJob(tr, 0)
 
+    async def delayed_submit_test(self, tr: TestRun, delay: int = 0):
+        await super().delayed_submit_test(tr, 0)
+
+    async def delayed_kill_job(self, job: BaseJob, delay: int = 0):
+        self.killed_by_dependency.append(job)
+        await asyncio.sleep(0)
+
 
 class MyWorkload(TestDefinition):
     model_config = ConfigDict(arbitrary_types_allowed=True)
@@ -93,3 +113,54 @@ def test_both_failed_runner_status_reported(self, runner: MyRunner):
         runner.runner_job_status_result = JobStatusResult(is_successful=False, error_message="runner job failed")
         res = runner.get_job_status(job)
         assert res == runner.runner_job_status_result
+
+
+class TestHandleDependencies:
+    """
+    Tests for BaseRunner.handle_dependencies method.
+
+    Both main and dependent TestRuns use the same MyWorkload test definition to reproduce an issue when jobs
+    comparison was done incorrectly.
+    """
+
+    @pytest.fixture
+    def tr_main(self, runner: MyRunner) -> TestRun:
+        return runner.test_scenario.test_runs[0]
+
+    @pytest.mark.asyncio
+    async def test_no_dependencies(self, runner: MyRunner, tr_main: TestRun):
+        await runner.handle_dependencies(BaseJob(tr_main, 0))
+        assert len(runner.submitted_trs) == 0
+
+    @pytest.mark.asyncio
+    async def test_start_post_comp(self, runner: MyRunner, tr_main: TestRun):
+        tr_dep = deepcopy(tr_main)
+        tr_dep.dependencies = {"start_post_comp": TestDependency(tr_main)}
+        runner.test_scenario.test_runs.append(tr_dep)
+
+        await runner.handle_dependencies(BaseJob(tr_dep, 0))  # self, should not trigger anything
+        assert len(runner.submitted_trs) == 0
+
+        await runner.handle_dependencies(BaseJob(tr_main, 0))
+        assert len(runner.submitted_trs) == 1
+        assert runner.submitted_trs[0] == tr_dep
+
+    @pytest.mark.asyncio
+    async def test_end_post_comp(self, runner: MyRunner, tr_main: TestRun):
+        tr_dep = deepcopy(tr_main)
+        tr_dep.dependencies = {"end_post_comp": TestDependency(tr_main)}
+        runner.test_scenario.test_runs.append(tr_dep)
+
+        # self not running, main completed -> nothing to kill
+        await runner.handle_dependencies(BaseJob(tr_main, 0))
+        assert len(runner.killed_by_dependency) == 0
+
+        # self is running, main completed -> should kill
+        await runner.submit_test(tr_dep)
+
+        await runner.handle_dependencies(BaseJob(tr_dep, 0))  # self, should not kill
+        assert len(runner.killed_by_dependency) == 0
+
+        await runner.handle_dependencies(BaseJob(tr_main, 0))
+        assert len(runner.killed_by_dependency) == 1
+        assert runner.killed_by_dependency[0].test_run == tr_dep