
Commit 2a8f267 (2 parents: 14e5217 + 889c80f)

Merge pull request #1309 from linsword13/update-env-vars

Update to use `environment_variable_names` option for various apps
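Each diff below makes the same change: a `workload_variable()` that was previously shadowed by a separate `environment_variable()` directive now declares its exported name directly through the `environment_variable_name` option, so one directive replaces two that had to be kept in sync by hand. A minimal before/after sketch of the pattern, using an illustrative `example_var`/`EXAMPLE_VAR` pair rather than any variable from the diffs below:

```python
# Before: two directives, with the description and workload group duplicated.
workload_variable(
    "example_var",                 # hypothetical variable, for illustration only
    default="1",
    description="Example knob",
    workload_group="all_workloads",
)
environment_variable(
    "EXAMPLE_VAR",
    "{example_var}",               # expands to the variable's value at runtime
    description="Example knob",
    workload_group="all_workloads",
)

# After: one directive; EXAMPLE_VAR is exported from the variable's value.
workload_variable(
    "example_var",
    environment_variable_name="EXAMPLE_VAR",
    default="1",
    description="Example knob",
    workload_group="all_workloads",
)
```

The diffs below instantiate this pattern for the nccl-tests, nvidia-hpl, py-nemo-2, and wrfv4 applications and the shared nvidia-hpc-benchmarks base application.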

File tree: 6 files changed (+32, −194 lines)

var/ramble/repos/builtin/applications/nccl-tests/application.py

Lines changed: 1 addition & 7 deletions
```diff
@@ -133,16 +133,10 @@ class NcclTests(ExecutableApplication):
         default="",
         description='How NCCL communicators should be split, if at all. "0x7" for rail-aligned, "0x0" for world-level.',
         workloads=all_workloads,
+        environment_variable_name="NCCL_TESTS_SPLIT_MASK",
         expandable=False,
     )
 
-    environment_variable(
-        "NCCL_TESTS_SPLIT_MASK",
-        "{nccl_tests_split_mask}",
-        'How NCCL communicators should be split, if at all. "0x7" for rail-aligned, "0x0" for world-level.',
-        workloads=all_workloads,
-    )
-
     # (output_name, units, group_name, regex)
     regex_parts = [
         ("Size", "B", "size", "[0-9]+"),
```

var/ramble/repos/builtin/applications/nvidia-hpl/application.py

Lines changed: 18 additions & 108 deletions
```diff
@@ -52,243 +52,153 @@ class NvidiaHpl(HplBase, NvidiaHPCBase):
 
     workload_variable(
         "hpl_fct_comm_policy",
+        environment_variable_name="HPL_FCT_COMM_POLICY",
         default="1",
         values=["0", "1"],
         description="Which communication library to use in the panel factorization. 0 = NVSHMEM, 1 = Host MPI",
         workload_group="all_workloads",
     )
-    environment_variable(
-        "HPL_FCT_COMM_POLICY",
-        "{hpl_fct_comm_policy}",
-        description="",
-        workload_group="all_workloads",
-    )
 
     workload_variable(
         "hpl_use_nvshmem",
+        environment_variable_name="HPL_USE_NVSHMEM",
         default="0",
         values=["0", "1"],
         description="Whether to use NVSHMEM or not. 0 = Disable, 1 = Enable.",
         workload_group="all_workloads",
     )
-    environment_variable(
-        "HPL_USE_NVSHMEM",
-        "{hpl_use_nvshmem}",
-        description="Whether or not to use NVSHMEM",
-        workload_group="all_workloads",
-    )
 
     workload_variable(
         "hpl_p2p_as_bcast",
+        environment_variable_name="HPL_P2P_AS_BCAST",
         default="0",
         values=["0", "1", "2", "3", "4"],
         description="0 = ncclBcast, 1 = ncclSend/Recv, 2 = CUDA-aware MPI, 3 = host MPI, 4 = NVSHMEM",
         workload_group="all_workloads",
     )
-    environment_variable(
-        "HPL_P2P_AS_BCAST",
-        "{hpl_p2p_as_bcast}",
-        description="Which communication library to use in the final solve step.",
-        workload_group="all_workloads",
-    )
 
     workload_variable(
         "hpl_nvshmem_swap",
+        environment_variable_name="HPL_NVSHMEM_SWAP",
         default="0",
         values=["0", "1"],
         description="Performs row swaps using NVSHMEM instead of NCCL. 0 = Disable, 1 = Enable.",
         workload_group="all_workloads",
     )
-    environment_variable(
-        "HPL_NVSHMEM_SWAP",
-        "{hpl_nvshmem_swap}",
-        description="Performs row swaps using NVSHMEM instead of NCCL. 0 = Disable, 1 = Enable.",
-        workload_group="all_workloads",
-    )
 
     workload_variable(
         "hpl_chunk_size_nbs",
+        environment_variable_name="HPL_CHUNK_SIZE_NBS",
         default="16",
         description="Number of matrix blocks to group for computations. Needs to be > 0",
         workload_group="all_workloads",
     )
-    environment_variable(
-        "HPL_CHUNK_SIZE_NBS",
-        "{hpl_chunk_size_nbs}",
-        description="Number of matrix blocks to group for computations. Needs to be > 0",
-        workload_group="all_workloads",
-    )
 
     workload_variable(
         "hpl_dist_trsm_flag",
+        environment_variable_name="HPL_DIST_TRSM_FLAG",
         default="1",
         values=["0", "1"],
         description="Perform the solve step (TRSM) in parallel, rather than on only the ranks that own part of the matrix.",
         workload_group="all_workloads",
     )
-    environment_variable(
-        "HPL_DIST_TRSM_FLAG",
-        "{hpl_dist_trsm_flag}",
-        description="Perform the solve step (TRSM) in parallel, rather than on only the ranks that own part of the matrix.",
-        workload_group="all_workloads",
-    )
 
     workload_variable(
         "hpl_cta_per_fct",
+        environment_variable_name="HPL_CTA_PER_FCT",
         default="16",
         description="Sets the number of CTAs (thread blocks) for factorization. Needs to be > 0.",
         workload_group="all_workloads",
     )
-    environment_variable(
-        "HPL_CTA_PER_FCT",
-        "{hpl_cta_per_fct}",
-        description="Sets the number of CTAs (thread blocks) for factorization. Needs to be > 0.",
-        workload_group="all_workloads",
-    )
 
     workload_variable(
         "hpl_alloc_hugepages",
+        environment_variable_name="HPL_ALLOC_HUGEPAGES",
         default="0",
         values=["0", "1"],
         description="Use 2MB hugepages for host-side allocations. Done through the madvise syscall.",
         workload_group="all_workloads",
     )
-    environment_variable(
-        "HPL_ALLOC_HUGEPAGES",
-        "{hpl_alloc_hugepages}",
-        description="Use 2MB hugepages for host-side allocations. Done through the madvise syscall.",
-        workload_group="all_workloads",
-    )
 
     workload_variable(
         "warmup_end_prog",
+        environment_variable_name="WARMUP_END_PROG",
         default="5",
         description="Runs the main loop once before the 'real' run. Stops the warmup at x%. Values can be 1 - 100.",
         workload_group="all_workloads",
     )
-    environment_variable(
-        "WARMUP_END_PROG",
-        "{warmup_end_prog}",
-        description="Runs the main loop once before the 'real' run. Stops the warmup at x%. Values can be 1 - 100.",
-        workload_group="all_workloads",
-    )
 
     workload_variable(
         "test_loops",
+        environment_variable_name="TEST_LOOPS",
         default="1",
         description="Runs the main loop X many times",
         workload_group="all_workloads",
     )
-    environment_variable(
-        "TEST_LOOPS",
-        "{test_loops}",
-        description="Runs the main loop X many times",
-        workload_group="all_workloads",
-    )
 
     workload_variable(
         "hpl_cusolver_mp_tests",
+        environment_variable_name="HPL_CUSOLVER_MP_TESTS",
         default="1",
         description="Runs several tests of individual components of HPL (GEMMS, comms, etc.)",
         workload_group="all_workloads",
     )
-    environment_variable(
-        "HPL_CUSOLVER_MP_TESTS",
-        "{hpl_cusolver_mp_tests}",
-        description="Runs several tests of individual components of HPL (GEMMS, comms, etc.)",
-        workload_group="all_workloads",
-    )
 
     workload_variable(
         "hpl_cusolver_mp_tests_gemm_iters",
+        environment_variable_name="HPL_CUSOLVER_MP_TESTS_GEMM_ITERS",
         default="128",
         description="Number of repeat GEMM calls in tests. Needs to be > 0.",
         workload_group="all_workloads",
     )
-    environment_variable(
-        "HPL_CUSOLVER_MP_TESTS_GEMM_ITERS",
-        "{hpl_cusolver_mp_tests_gemm_iters}",
-        description="Number of repeat GEMM calls in tests. Needs to be > 0.",
-        workload_group="all_workloads",
-    )
 
     workload_variable(
         "hpl_ooc_mode",
+        environment_variable_name="HPL_OOC_MODE",
         default="0",
         description="Enables / disales out-of-core mode",
         workload_group="all_workloads",
     )
-    environment_variable(
-        "HPL_OOC_MODE",
-        "{hpl_ooc_mode}",
-        description="Enables / disales out-of-core mode",
-        workload_group="all_workloads",
-    )
 
     workload_variable(
         "hpl_ooc_max_gpu_mem",
+        environment_variable_name="HPL_OOC_MAX_GPU_MEM",
         default="-1",
         description="Limits the amount of GPU memory used for OOC. In GiB. Needs to be >= -1.",
         workload_group="all_workloads",
     )
-    environment_variable(
-        "HPL_OOC_MAX_GPU_MEM",
-        "{hpl_ooc_max_gpu_mem}",
-        description="Limits the amount of GPU memory used for OOC. In GiB. Needs to be >= -1.",
-        workload_group="all_workloads",
-    )
 
     workload_variable(
         "hpl_ooc_tile_m",
+        environment_variable_name="HPL_OOC_TILE_M",
         default="4096",
         description="Row blocking factor. Needs to be > 0",
         workload_group="all_workloads",
     )
-    environment_variable(
-        "HPL_OOC_TILE_M",
-        "{hpl_ooc_tile_m}",
-        description="Row blocking factor. Needs to be > 0",
-        workload_group="all_workloads",
-    )
 
     workload_variable(
         "hpl_ooc_tile_n",
+        environment_variable_name="HPL_OOC_TILE_N",
         default="4096",
         description="Column blocking factor. Needs to be > 0",
         workload_group="all_workloads",
     )
-    environment_variable(
-        "HPL_OOC_TILE_N",
-        "{hpl_ooc_tile_n}",
-        description="Column blocking factor. Needs to be > 0",
-        workload_group="all_workloads",
-    )
 
     workload_variable(
         "hpl_ooc_num_streams",
+        environment_variable_name="HPL_OOC_NUM_STREAMS",
         default="3",
         description="Number of streams used for OCC operations",
         workload_group="all_workloads",
     )
-    environment_variable(
-        "HPL_OOC_NUM_STREAMS",
-        "{hpl_ooc_num_streams}",
-        description="Number of streams used for OCC operations",
-        workload_group="all_workloads",
-    )
 
     workload_variable(
         "hpl_ooc_safe_size",
+        environment_variable_name="HPL_OOC_SAFE_SIZE",
         default="2.0",
         description="GPU memory (in GiB) needed for driver. This amount will not be used by HPL OCC",
         workload_group="all_workloads",
     )
-    environment_variable(
-        "HPL_OOC_SAFE_SIZE",
-        "{hpl_ooc_safe_size}",
-        description="GPU memory (in GiB) needed for driver. This amount will not be used by HPL OCC",
-        workload_group="all_workloads",
-    )
 
     workload_variable(
         "block_size",
```

var/ramble/repos/builtin/applications/py-nemo-2/application.py

Lines changed: 1 addition & 6 deletions
```diff
@@ -77,16 +77,11 @@ class PyNemo2(BasePyNemo):
 
     workload_variable(
         "results_mount",
+        environment_variable_name="NEMO_CONTAINER_MOUNTS",
         default="{experiment_run_dir}:{experiment_run_dir}",
         description="Container mount for results data",
         workload_group="pretraining",
     )
-    environment_variable(
-        "NEMO_CONTAINER_MOUNTS",
-        value="{results_mount}",
-        description="All container mounts in an environment variable",
-        workload_group="pretraining",
-    )
     workload_variable(
         "container_mounts",
         default="{results_mount}",
```

var/ramble/repos/builtin/applications/wrfv4/application.py

Lines changed: 1 addition & 7 deletions
```diff
@@ -197,15 +197,9 @@ class Wrfv4(ExecutableApplication):
     )
 
     with when("+wrf_tiles"):
-        environment_variable(
-            "NUM_WRF_TILES",
-            value="{num_tiles}",
-            description="Number of tiles to use in WRF domain",
-            workload_group="all_workloads",
-        )
-
         workload_variable(
             "num_tiles",
+            environment_variable_name="NUM_WRF_TILES",
             default="1",
             description="Number of tiles to use in WRF domain",
             workload_group="all_workloads",
```

var/ramble/repos/builtin/base_applications/nvidia-hpc-benchmarks/base_application.py

Lines changed: 6 additions & 36 deletions
```diff
@@ -50,78 +50,48 @@ class NvidiaHpcBenchmarks(ExecutableApplication):
 
     workload_variable(
         "nvshmem_disable_cuda_vmm",
+        environment_variable_name="NVSHMEM_DISABLE_CUDA_VMM",
         default="1",
         description="",
         workload_group="all_workloads",
     )
-    environment_variable(
-        "NVSHMEM_DISABLE_CUDA_VMM",
-        "{nvshmem_disable_cuda_vmm}",
-        description="",
-        workload_group="all_workloads",
-    )
 
     workload_variable(
         "pmix_mca_gds",
+        environment_variable_name="PMIX_MCA_gds",
         default="^ds12",
-        description="",
-        workload_group="all_workloads",
-    )
-    environment_variable(
-        "PMIX_MCA_gds",
-        "{pmix_mca_gds}",
         description="PMIX MCA gds",
         workload_group="all_workloads",
     )
 
     workload_variable(
         "ompi_mca_btl",
+        environment_variable_name="OMPI_MCA_btl",
         default="^vader,tcp,openib,uct",
-        description="",
-        workload_group="all_workloads",
-    )
-    environment_variable(
-        "OMPI_MCA_btl",
-        "{ompi_mca_btl}",
         description="OpenMPI MCA btl",
         workload_group="all_workloads",
     )
 
     workload_variable(
         "ompi_mca_pml",
+        environment_variable_name="OMPI_MCA_pml",
         default="ucx",
-        description="",
-        workload_group="all_workloads",
-    )
-    environment_variable(
-        "OMPI_MCA_pml",
-        "{ompi_mca_pml}",
         description="OpenMPI MCA pml",
         workload_group="all_workloads",
     )
 
     workload_variable(
         "ucx_net_devices",
+        environment_variable_name="UCX_NET_DEVICES",
         default="enp6s0,enp12s0,enp134s0,enp140s0",
-        description="",
-        workload_group="all_workloads",
-    )
-    environment_variable(
-        "UCX_NET_DEVICES",
-        "{ucx_net_devices}",
         description="UCX Net Devices",
         workload_group="all_workloads",
     )
 
     workload_variable(
         "ucx_max_rndv_rails",
+        environment_variable_name="UCX_MAX_RNDV_RAILS",
         default="4",
-        description="",
-        workload_group="all_workloads",
-    )
-    environment_variable(
-        "UCX_MAX_RNDV_RAILS",
-        "{ucx_max_rndv_rails}",
         description="UCX MAximum RNDV Rails",
         workload_group="all_workloads",
     )
```
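In this base application the old `workload_variable()` entries had empty descriptions while the paired `environment_variable()` directives carried the human-readable ones, so the diff keeps the env-var descriptions as the surviving `description=` lines. For example, the consolidated PMIx knob reads (reassembled from the hunk above):

```python
workload_variable(
    "pmix_mca_gds",
    environment_variable_name="PMIX_MCA_gds",
    default="^ds12",
    description="PMIX MCA gds",
    workload_group="all_workloads",
)
```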
