Merge pull request #687 from NVIDIA/am/bug-4545846

amaslenn · web-flow · commit c2f709fb706a · 2025-09-09T16:59:00.000+02:00
Explicitly forbid dependencies for scenarios with DSE jobs
diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py
@@ -115,24 +115,34 @@ def prepare_installation(
     return installables, installer
 
 
-def handle_dse_job(runner: Runner, args: argparse.Namespace):
+def handle_dse_job(runner: Runner, args: argparse.Namespace) -> int:
     registry = Registry()
 
     original_test_runs = copy.deepcopy(runner.runner.test_scenario.test_runs)
 
+    has_dependencies = any(tr.dependencies for tr in runner.runner.test_scenario.test_runs)
+    if has_dependencies:
+        logging.error(
+            "Dependencies are not supported for DSE jobs, all cases run consecutively. "
+            "Please remove dependencies and re-run."
+        )
+        return 1
+
+    err = 0
     for tr in runner.runner.test_scenario.test_runs:
         test_run = copy.deepcopy(tr)
-        env = CloudAIGymEnv(test_run=test_run, runner=runner.runner)
-        agent_type = test_run.test.test_definition.agent
 
+        agent_type = test_run.test.test_definition.agent
         agent_class = registry.agents_map.get(agent_type)
         if agent_class is None:
             logging.error(
                 f"No agent available for type: {agent_type}. Please make sure {agent_type} "
                 f"is a valid agent type. Available agents: {registry.agents_map.keys()}"
             )
+            err = 1
             continue
 
+        env = CloudAIGymEnv(test_run=test_run, runner=runner.runner)
         agent = agent_class(env)
         for step in range(agent.max_steps):
             result = agent.select_action()
@@ -150,6 +160,7 @@ def handle_dse_job(runner: Runner, args: argparse.Namespace):
         generate_reports(runner.runner.system, runner.runner.test_scenario, runner.runner.scenario_root)
 
     logging.info("All jobs are complete.")
+    return err
 
 
 def generate_reports(system: System, test_scenario: TestScenario, result_dir: Path) -> None:
@@ -291,8 +302,7 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int:
         return 0
 
     if all(tr.is_dse_job for tr in test_scenario.test_runs):
-        handle_dse_job(runner, args)
-        return 0
+        return handle_dse_job(runner, args)
 
     logging.error("Mixing DSE and non-DSE jobs is not allowed.")
     return 1
diff --git a/tests/test_handlers.py b/tests/test_handlers.py
@@ -0,0 +1,45 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+import pytest
+
+from cloudai.cli.handlers import handle_dse_job
+from cloudai.core import Runner, TestDependency, TestRun, TestScenario
+from cloudai.systems.slurm.slurm_system import SlurmSystem
+
+
+@pytest.mark.parametrize("dep", ["start_post_comp", "start_post_init", "end_post_comp"])
+def test_dse_run_does_not_support_dependencies(
+    slurm_system: SlurmSystem, dse_tr: TestRun, dep: str, caplog: pytest.LogCaptureFixture
+) -> None:
+    """
+    DSE runs do not support dependencies.
+
+    The DSE engine re-uses BaseRunner by manually controlling which test_run to execute. BaseRunner doesn't keep track
+    of all jobs and their statuses, so this information is not available between cases in a scenario or even between
+    steps of a single test run.
+
+    While it might be useful in the future, today we have to explicitly forbid such configurations and report an
+    actionable error to users.
+    """
+    dse_tr.dependencies = {dep: TestDependency(test_run=dse_tr)}
+    test_scenario: TestScenario = TestScenario(name="test_scenario", test_runs=[dse_tr])
+    runner = Runner(mode="dry-run", system=slurm_system, test_scenario=test_scenario)
+    assert handle_dse_job(runner, argparse.Namespace(mode="dry-run")) == 1
+    assert "Dependencies are not supported for DSE jobs, all cases run consecutively." in caplog.text
+    assert "Please remove dependencies and re-run." in caplog.text