Skip to content

Commit c2f709f

Browse files
authored
Merge pull request #687 from NVIDIA/am/bug-4545846
Explicitly forbid dependencies for scenarios with DSE jobs
2 parents a170baa + 4d26e62 commit c2f709f

File tree

2 files changed

+60
-5
lines changed

2 files changed

+60
-5
lines changed

src/cloudai/cli/handlers.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -115,24 +115,34 @@ def prepare_installation(
115115
return installables, installer
116116

117117

118-
def handle_dse_job(runner: Runner, args: argparse.Namespace):
118+
def handle_dse_job(runner: Runner, args: argparse.Namespace) -> int:
119119
registry = Registry()
120120

121121
original_test_runs = copy.deepcopy(runner.runner.test_scenario.test_runs)
122122

123+
has_dependencies = any(tr.dependencies for tr in runner.runner.test_scenario.test_runs)
124+
if has_dependencies:
125+
logging.error(
126+
"Dependencies are not supported for DSE jobs, all cases run consecutively. "
127+
"Please remove dependencies and re-run."
128+
)
129+
return 1
130+
131+
err = 0
123132
for tr in runner.runner.test_scenario.test_runs:
124133
test_run = copy.deepcopy(tr)
125-
env = CloudAIGymEnv(test_run=test_run, runner=runner.runner)
126-
agent_type = test_run.test.test_definition.agent
127134

135+
agent_type = test_run.test.test_definition.agent
128136
agent_class = registry.agents_map.get(agent_type)
129137
if agent_class is None:
130138
logging.error(
131139
f"No agent available for type: {agent_type}. Please make sure {agent_type} "
132140
f"is a valid agent type. Available agents: {registry.agents_map.keys()}"
133141
)
142+
err = 1
134143
continue
135144

145+
env = CloudAIGymEnv(test_run=test_run, runner=runner.runner)
136146
agent = agent_class(env)
137147
for step in range(agent.max_steps):
138148
result = agent.select_action()
@@ -150,6 +160,7 @@ def handle_dse_job(runner: Runner, args: argparse.Namespace):
150160
generate_reports(runner.runner.system, runner.runner.test_scenario, runner.runner.scenario_root)
151161

152162
logging.info("All jobs are complete.")
163+
return err
153164

154165

155166
def generate_reports(system: System, test_scenario: TestScenario, result_dir: Path) -> None:
@@ -291,8 +302,7 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int:
291302
return 0
292303

293304
if all(tr.is_dse_job for tr in test_scenario.test_runs):
294-
handle_dse_job(runner, args)
295-
return 0
305+
return handle_dse_job(runner, args)
296306

297307
logging.error("Mixing DSE and non-DSE jobs is not allowed.")
298308
return 1

tests/test_handlers.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
import argparse
18+
19+
import pytest
20+
21+
from cloudai.cli.handlers import handle_dse_job
22+
from cloudai.core import Runner, TestDependency, TestRun, TestScenario
23+
from cloudai.systems.slurm.slurm_system import SlurmSystem
24+
25+
26+
@pytest.mark.parametrize("dep", ["start_post_comp", "start_post_init", "end_post_comp"])
def test_dse_run_does_not_support_dependencies(
    slurm_system: SlurmSystem, dse_tr: TestRun, dep: str, caplog: pytest.LogCaptureFixture
) -> None:
    """
    Check that a DSE scenario declaring any dependency is rejected.

    The DSE engine re-uses BaseRunner by manually controlling which test_run is executed. BaseRunner doesn't keep
    track of all jobs and their statuses, so this information is not available between cases in a scenario or even
    between steps of a single test run.

    Supporting dependencies might be useful in the future; for now such configurations have to be explicitly
    forbidden and an actionable error reported to the user.
    """
    # Attach a single dependency of the parametrized kind; the test run simply points at itself.
    dse_tr.dependencies = {dep: TestDependency(test_run=dse_tr)}
    scenario: TestScenario = TestScenario(name="test_scenario", test_runs=[dse_tr])
    runner = Runner(mode="dry-run", system=slurm_system, test_scenario=scenario)

    exit_code = handle_dse_job(runner, argparse.Namespace(mode="dry-run"))

    # The handler must signal failure and tell the user how to fix the scenario.
    assert exit_code == 1
    expected_messages = (
        "Dependencies are not supported for DSE jobs, all cases run consecutively.",
        "Please remove dependencies and re-run.",
    )
    for message in expected_messages:
        assert message in caplog.text

0 commit comments

Comments
 (0)