Skip to content

Commit b52cd2b

Browse files
authored
Merge pull request #638 from NVIDIA/am/slurm-container-upd
Updates for SlurmContainer workload
2 parents d76029c + 7f4f0ee commit b52cd2b

File tree

5 files changed, with 32 additions and 73 deletions
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
name = "slurm-container"
18+
19+
[[Tests]]
20+
id = "nccl.alltoall"
21+
num_nodes = 2
22+
time_limit = "00:20:00"
23+
24+
name = "nccl-alltoall"
25+
description = "NCCL alltoall via SlurmContainer"
26+
test_template_name = "SlurmContainer"
27+
28+
[Tests.cmd_args]
29+
docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3"
30+
cmd = "alltoall_perf_mpi --nthreads 1 --ngpus 1 --minbytes 128 --maxbytes 4G --stepbytes 1M --op sum --datatype float --root 0 --iters 100 --warmup_iters 50 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 --stepfactor 2"

src/cloudai/registration.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -95,11 +95,7 @@ def register_all():
9595
SleepStandaloneCommandGenStrategy,
9696
SleepTestDefinition,
9797
)
98-
from cloudai.workloads.slurm_container import (
99-
SlurmContainerCommandGenStrategy,
100-
SlurmContainerReportGenerationStrategy,
101-
SlurmContainerTestDefinition,
102-
)
98+
from cloudai.workloads.slurm_container import SlurmContainerCommandGenStrategy, SlurmContainerTestDefinition
10399
from cloudai.workloads.triton_inference import (
104100
TritonInferenceReportGenerationStrategy,
105101
TritonInferenceSlurmCommandGenStrategy,
@@ -201,7 +197,6 @@ def register_all():
201197
Registry().add_report(NeMoRunTestDefinition, NeMoRunReportGenerationStrategy)
202198
Registry().add_report(NeMoRunTestDefinition, NeMoRunDataStoreReportGenerationStrategy)
203199
Registry().add_report(NemotronTestDefinition, JaxToolboxReportGenerationStrategy)
204-
Registry().add_report(SlurmContainerTestDefinition, SlurmContainerReportGenerationStrategy)
205200
Registry().add_report(UCCTestDefinition, UCCTestReportGenerationStrategy)
206201
Registry().add_report(TritonInferenceTestDefinition, TritonInferenceReportGenerationStrategy)
207202
Registry().add_report(NIXLBenchTestDefinition, NIXLBenchReportGenerationStrategy)

src/cloudai/workloads/slurm_container/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,11 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17-
from .report_generation_strategy import SlurmContainerReportGenerationStrategy
1817
from .slurm_command_gen_strategy import SlurmContainerCommandGenStrategy
1918
from .slurm_container import SlurmContainerCmdArgs, SlurmContainerTestDefinition
2019

2120
__all__ = [
2221
"SlurmContainerCmdArgs",
2322
"SlurmContainerCommandGenStrategy",
24-
"SlurmContainerReportGenerationStrategy",
2523
"SlurmContainerTestDefinition",
2624
]

src/cloudai/workloads/slurm_container/report_generation_strategy.py

Lines changed: 0 additions & 62 deletions
This file was deleted.

tests/test_test_scenario.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,6 @@
6565
NeMoRunTestDefinition,
6666
)
6767
from cloudai.workloads.nixl_bench import NIXLBenchReportGenerationStrategy, NIXLBenchTestDefinition
68-
from cloudai.workloads.slurm_container import SlurmContainerReportGenerationStrategy, SlurmContainerTestDefinition
6968
from cloudai.workloads.triton_inference import TritonInferenceReportGenerationStrategy, TritonInferenceTestDefinition
7069
from cloudai.workloads.ucc_test import UCCTestDefinition, UCCTestReportGenerationStrategy
7170

@@ -481,7 +480,7 @@ def test_default(self):
481480
assert len(reporters) == 0
482481

483482
def test_default_reporters_size(self):
484-
assert len(Registry().reports_map) == 13
483+
assert len(Registry().reports_map) == 12
485484

486485
@pytest.mark.parametrize(
487486
"tdef,expected_reporters",
@@ -494,7 +493,6 @@ def test_default_reporters_size(self):
494493
(NeMoLauncherTestDefinition, {NeMoLauncherReportGenerationStrategy}),
495494
(NeMoRunTestDefinition, {NeMoRunReportGenerationStrategy, NeMoRunDataStoreReportGenerationStrategy}),
496495
(NemotronTestDefinition, {JaxToolboxReportGenerationStrategy}),
497-
(SlurmContainerTestDefinition, {SlurmContainerReportGenerationStrategy}),
498496
(UCCTestDefinition, {UCCTestReportGenerationStrategy}),
499497
(TritonInferenceTestDefinition, {TritonInferenceReportGenerationStrategy}),
500498
(NIXLBenchTestDefinition, {NIXLBenchReportGenerationStrategy}),

Comments (0)