Skip to content

Commit fe75dea

Browse files
authored
Merge pull request #393 from NVIDIA/am/tdef-in-scenario
Control workload settings from scenario
2 parents c874795 + caa11ef commit fe75dea

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

42 files changed

+760
-660
lines changed
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
name = "nccl-tests"
18+
19+
[[Tests]]
20+
id = "nccl.all_reduce"
21+
num_nodes = 2
22+
time_limit = "00:20:00"
23+
24+
name = "nccl-all_reduce"
25+
description = "all_reduce"
26+
test_template_name = "NcclTest"
27+
28+
[Tests.cmd_args]
29+
subtest_name = "all_reduce_perf_mpi"
30+
ngpus = 1
31+
minbytes = "128"
32+
maxbytes = "512"
33+
iters = 50
34+
warmup_iters = 10
35+
36+
[[Tests]]
37+
id = "nccl.all_gather"
38+
num_nodes = 2
39+
time_limit = "00:20:00"
40+
41+
name = "nccl-all_gather"
42+
description = "all_gather"
43+
test_template_name = "NcclTest"
44+
45+
[Tests.cmd_args]
46+
subtest_name = "all_gather_perf_mpi"
47+
ngpus = 1
48+
minbytes = "128"
49+
maxbytes = "512"
50+
iters = 50
51+
warmup_iters = 10

conf/experimental/test/nemo_launcher_nemotron_15b_bf16_16_node.toml

Lines changed: 0 additions & 55 deletions
This file was deleted.

conf/experimental/test/nemo_launcher_nemotron_15b_bf16_32_node.toml

Lines changed: 0 additions & 55 deletions
This file was deleted.

conf/experimental/test/nemo_launcher_nemotron_15b_bf16_4_node.toml

Lines changed: 0 additions & 55 deletions
This file was deleted.

conf/experimental/test/nemo_launcher_nemotron_15b_bf16_64_node.toml

Lines changed: 0 additions & 55 deletions
This file was deleted.

conf/experimental/test/nemo_launcher_nemotron_15b_bf16_8_node.toml

Lines changed: 0 additions & 55 deletions
This file was deleted.

conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16.toml

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,25 +23,40 @@ num_nodes = "2"
2323

2424
[[Tests]]
2525
id = "nemo_launcher_nemotron_15b_bf16_4_node"
26-
test_name = "nemo_launcher_nemotron_15b_bf16_4_node"
26+
test_name = "nemo_launcher_nemotron_15b_bf16_2_node"
2727
num_nodes = "4"
2828

29+
[Tests.cmd_args.training.model]
30+
global_batch_size = 128
31+
2932
[[Tests]]
3033
id = "nemo_launcher_nemotron_15b_bf16_8_node"
31-
test_name = "nemo_launcher_nemotron_15b_bf16_8_node"
34+
test_name = "nemo_launcher_nemotron_15b_bf16_2_node"
3235
num_nodes = "8"
3336

37+
[Tests.cmd_args.training.model]
38+
global_batch_size = 256
39+
3440
[[Tests]]
3541
id = "nemo_launcher_nemotron_15b_bf16_16_node"
36-
test_name = "nemo_launcher_nemotron_15b_bf16_16_node"
42+
test_name = "nemo_launcher_nemotron_15b_bf16_2_node"
3743
num_nodes = "16"
3844

45+
[Tests.cmd_args.training.model]
46+
global_batch_size = 512
47+
3948
[[Tests]]
4049
id = "nemo_launcher_nemotron_15b_bf16_32_node"
41-
test_name = "nemo_launcher_nemotron_15b_bf16_32_node"
50+
test_name = "nemo_launcher_nemotron_15b_bf16_2_node"
4251
num_nodes = "32"
4352

53+
[Tests.cmd_args.training.model]
54+
global_batch_size = 1024
55+
4456
[[Tests]]
4557
id = "nemo_launcher_nemotron_15b_bf16_64_node"
46-
test_name = "nemo_launcher_nemotron_15b_bf16_64_node"
58+
test_name = "nemo_launcher_nemotron_15b_bf16_2_node"
4759
num_nodes = "64"
60+
61+
[Tests.cmd_args.training.model]
62+
global_batch_size = 2048

conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_16_node.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,8 @@ name = "nemo_launcher_nemotron_15b_bf16_16_node"
1818

1919
[[Tests]]
2020
id = "nemo_launcher_nemotron_15b_bf16_16_node"
21-
test_name = "nemo_launcher_nemotron_15b_bf16_16_node"
21+
test_name = "nemo_launcher_nemotron_15b_bf16_2_node"
2222
num_nodes = "16"
23+
24+
[Tests.cmd_args.training.model]
25+
global_batch_size = 512

conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_32_node.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,8 @@ name = "nemo_launcher_nemotron_15b_bf16_32_node"
1818

1919
[[Tests]]
2020
id = "nemo_launcher_nemotron_15b_bf16_32_node"
21-
test_name = "nemo_launcher_nemotron_15b_bf16_32_node"
21+
test_name = "nemo_launcher_nemotron_15b_bf16_2_node"
2222
num_nodes = "32"
23+
24+
[Tests.cmd_args.training.model]
25+
global_batch_size = 1024

0 commit comments

Comments
 (0)