Skip to content

Commit ff3c491

Browse files
authored
Merge pull request #590 from NVIDIA/am/ucc-upd
Update UCC configs
2 parents 2ca7135 + 2f148dd commit ff3c491

File tree

10 files changed: 38 additions and 81 deletions.
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2-
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
# SPDX-License-Identifier: Apache-2.0
44
#
55
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,9 +14,9 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17-
name = "ucc_test_allgather"
18-
description = "allgather"
17+
name = "ucc_base_test"
18+
description = "Base config for UCC tests"
1919
test_template_name = "UCCTest"
2020

2121
[cmd_args]
22-
"collective" = "allgather"
22+
docker_image_url = "nvcr.io/nvidia/pytorch:25.06-py3"

conf/common/test/ucc_test_allreduce.toml

Lines changed: 0 additions & 23 deletions
This file was deleted.

conf/common/test/ucc_test_alltoall.toml

Lines changed: 0 additions & 22 deletions
This file was deleted.

conf/common/test/ucc_test_reduce_scatter.toml

Lines changed: 0 additions & 22 deletions
This file was deleted.

conf/common/test_scenario/ucc_test.toml

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,33 +18,46 @@ name = "ucc_test"
1818

1919
[[Tests]]
2020
id = "Tests.alltoall"
21-
test_name = "ucc_test_alltoall"
21+
test_name = "ucc_base_test"
22+
description = "UCC alltoall"
2223
time_limit = "00:20:00"
2324
num_nodes = 2
25+
[Tests.cmd_args]
26+
collective = "alltoall"
2427

2528
[[Tests]]
2629
id = "Tests.allgather"
27-
test_name = "ucc_test_allgather"
30+
test_name = "ucc_base_test"
31+
description = "UCC allgather"
2832
time_limit = "00:20:00"
2933
num_nodes = 2
34+
[Tests.cmd_args]
35+
collective = "allgather"
3036
[[Tests.dependencies]]
3137
type = "start_post_comp"
3238
id = "Tests.alltoall"
3339

3440
[[Tests]]
3541
id = "Tests.allreduce"
36-
test_name = "ucc_test_allreduce"
42+
test_name = "ucc_base_test"
43+
description = "UCC allreduce"
3744
time_limit = "00:20:00"
3845
num_nodes = 2
46+
[Tests.cmd_args]
47+
collective = "allreduce"
48+
e = "4G"
3949
[[Tests.dependencies]]
4050
type = "start_post_comp"
4151
id = "Tests.allgather"
4252

4353
[[Tests]]
4454
id = "Tests.reduce_scatter"
45-
test_name = "ucc_test_reduce_scatter"
55+
test_name = "ucc_base_test"
56+
description = "UCC reduce_scatter"
4657
time_limit = "00:20:00"
4758
num_nodes = 2
59+
[Tests.cmd_args]
60+
collective = "reduce_scatter"
4861
[[Tests.dependencies]]
4962
type = "start_post_comp"
5063
id = "Tests.allreduce"

src/cloudai/workloads/ucc_test/ucc.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
class UCCCmdArgs(CmdArgs):
2424
"""UCC test command arguments."""
2525

26-
docker_image_url: str = "nvcr.io/nvidia/pytorch:24.02-py3"
26+
docker_image_url: str
2727
collective: Union[
2828
Literal[
2929
"allgather",

tests/report_generation_strategy/test_ucc_report_generation_strategy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ def ucc_tr(slurm_system: SlurmSystem) -> TestRun:
7272
name="ucc_test",
7373
description="ucc_test",
7474
test_template_name="ucc_test",
75-
cmd_args=UCCCmdArgs(),
75+
cmd_args=UCCCmdArgs(docker_image_url="url://fake/ucc"),
7676
),
7777
test_template=TestTemplate(system=slurm_system),
7878
),

tests/slurm_command_gen_strategy/test_ucc_slurm_command_gen_strategy.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def cmd_gen_strategy(self, slurm_system: SlurmSystem) -> UCCTestSlurmCommandGenS
3333
"cmd_args_data, extra_cmd_args, expected_command",
3434
[
3535
(
36-
{"collective": "allgather", "b": 8, "e": "256M"},
36+
{"collective": "allgather", "b": 8, "e": "256M", "docker_image_url": "url://fake/ucc"},
3737
{"--max-steps": "100"},
3838
[
3939
"/opt/hpcx/ucc/bin/ucc_perftest",
@@ -46,7 +46,7 @@ def cmd_gen_strategy(self, slurm_system: SlurmSystem) -> UCCTestSlurmCommandGenS
4646
],
4747
),
4848
(
49-
{"collective": "allreduce", "b": 4, "e": "8M"},
49+
{"collective": "allreduce", "b": 4, "e": "8M", "docker_image_url": "url://fake/ucc"},
5050
{},
5151
[
5252
"/opt/hpcx/ucc/bin/ucc_perftest",
@@ -68,6 +68,7 @@ def test_generate_test_command(
6868
expected_command: list[str],
6969
) -> None:
7070
ucc_cmd_args = UCCCmdArgs(
71+
docker_image_url=cmd_args_data["docker_image_url"],
7172
collective=cmd_args_data["collective"],
7273
b=cmd_args_data["b"],
7374
e=cmd_args_data.get("e", "8M"),

tests/test_acceptance.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,12 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -
278278
partial_tr,
279279
slurm_system,
280280
"ucc",
281-
UCCTestDefinition(name="ucc", description="ucc", test_template_name="ucc", cmd_args=UCCCmdArgs()),
281+
UCCTestDefinition(
282+
name="ucc",
283+
description="ucc",
284+
test_template_name="ucc",
285+
cmd_args=UCCCmdArgs(docker_image_url="nvcr.io/nvidia/pytorch:24.02-py3"),
286+
),
282287
UCCTestSlurmCommandGenStrategy,
283288
),
284289
"nccl": lambda: create_test_run(

tests/test_test_definitions.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,12 @@ def test_chakra_docker_image_is_required():
104104
@pytest.mark.parametrize(
105105
"test",
106106
[
107-
UCCTestDefinition(name="ucc", description="desc", test_template_name="ucc", cmd_args=UCCCmdArgs()),
107+
UCCTestDefinition(
108+
name="ucc",
109+
description="desc",
110+
test_template_name="ucc",
111+
cmd_args=UCCCmdArgs(docker_image_url="fake://url/ucc"),
112+
),
108113
NCCLTestDefinition(name="nccl", description="desc", test_template_name="nccl", cmd_args=NCCLCmdArgs()),
109114
GPTTestDefinition(
110115
name="gpt",

0 commit comments

Comments
 (0)