Skip to content

Commit 855c8c0

Browse files
samskillmanRachaelSTamakloe
authored andcommitted
Merge pull request #3725 from samskillman/a4-nccl-tests
A4 nccl tests
1 parent 6ef461f commit 855c8c0

File tree

3 files changed

+336
-1
lines changed

3 files changed

+336
-1
lines changed

examples/machine-learning/a4-highgpu-8g/a4high-slurm-blueprint.yaml

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ vars:
4141
family: $(vars.deployment_name)-u22
4242
disk_size_gb: 200
4343
nccl_plugin_version: v1.0.4
44+
benchmark_dir: $(ghpc_stage("system_benchmarks"))
4445

4546
deployment_groups:
4647
- group: image-env
@@ -54,6 +55,7 @@ deployment_groups:
5455
install_ansible: true
5556
docker:
5657
enabled: true
58+
world_writable: true
5759
runners:
5860
- type: shell
5961
destination: prep-for-slurm-build.sh
@@ -108,10 +110,12 @@ deployment_groups:
108110
nvidia_packages:
109111
- nvidia-open-570
110112
- nvidia-utils-570
111-
- cuda-toolkit-12-8
112113
- nvidia-container-toolkit
114+
- cuda-toolkit-12-8
113115
- datacenter-gpu-manager
116+
- libnvidia-cfg1-570-server
114117
- libnvidia-nscq-570
118+
- nvidia-compute-utils-570-server
115119
tasks:
116120
- name: Download NVIDIA repository package
117121
ansible.builtin.get_url:
@@ -131,12 +135,40 @@ deployment_groups:
131135
name: "{{ item }}"
132136
selection: hold
133137
loop: "{{ nvidia_packages }}"
138+
- name: Create nvidia-persistenced override directory
139+
ansible.builtin.file:
140+
path: /etc/systemd/system/nvidia-persistenced.service.d
141+
state: directory
142+
owner: root
143+
group: root
144+
mode: 0o755
145+
- name: Configure nvidia-persistenced override
146+
ansible.builtin.copy:
147+
dest: /etc/systemd/system/nvidia-persistenced.service.d/persistence_mode.conf
148+
owner: root
149+
group: root
150+
mode: 0o644
151+
content: |
152+
[Service]
153+
ExecStart=
154+
ExecStart=/usr/bin/nvidia-persistenced --user nvidia-persistenced --verbose
155+
notify: Reload SystemD
156+
handlers:
157+
- name: Reload SystemD
158+
ansible.builtin.systemd:
159+
daemon_reload: true
134160
post_tasks:
135161
- name: Disable NVIDIA DCGM by default (enable during boot on GPU nodes)
136162
ansible.builtin.service:
137163
name: nvidia-dcgm.service
138164
state: stopped
139165
enabled: false
166+
- name: Disable nvidia-persistenced SystemD unit (enable during boot on GPU nodes)
167+
ansible.builtin.service:
168+
name: nvidia-persistenced.service
169+
state: stopped
170+
enabled: false
171+
140172
- type: ansible-local
141173
destination: install_ibverbs_utils.yml
142174
content: |
@@ -150,6 +182,11 @@ deployment_groups:
150182
name:
151183
- ibverbs-utils
152184
state: present
185+
- type: data
186+
destination: /etc/enroot/enroot.conf
187+
content: |
188+
ENROOT_CONFIG_PATH ${HOME}/.enroot
189+
153190
154191
- group: image
155192
modules:
@@ -314,6 +351,7 @@ deployment_groups:
314351
vars:
315352
enable_ops_agent: true
316353
enable_nvidia_dcgm: true
354+
enable_nvidia_persistenced: true
317355
tasks:
318356
- name: Update Ops Agent configuration
319357
ansible.builtin.blockinfile:
@@ -348,6 +386,11 @@ deployment_groups:
348386
name: nvidia-dcgm.service
349387
state: "{{ 'started' if enable_nvidia_dcgm else 'stopped' }}"
350388
enabled: "{{ enable_nvidia_dcgm }}"
389+
- name: Enable NVIDIA Persistence Daemon
390+
ansible.builtin.service:
391+
name: nvidia-persistenced.service
392+
state: "{{ 'started' if enable_nvidia_persistenced else 'stopped' }}"
393+
enabled: "{{ enable_nvidia_persistenced }}"
351394
352395
- id: a4high_nodeset
353396
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
@@ -415,6 +458,12 @@ deployment_groups:
415458
mkdir -m 0755 -p "${SLURM_ROOT}/scripts"
416459
mkdir -p "${SLURM_ROOT}/partition-${PARTITION_NAME}-epilog_slurmd.d"
417460
ln -s "/slurm/scripts/tools/gpu-test" "${SLURM_ROOT}/partition-${PARTITION_NAME}-epilog_slurmd.d/gpu-test.epilog_slurmd"
461+
- type: data
462+
destination: /opt/apps/system_benchmarks/run-nccl-tests-via-ramble.sh
463+
source: $(vars.benchmark_dir)/run-nccl-tests-via-ramble.sh
464+
- type: data
465+
destination: /opt/apps/system_benchmarks/README.md
466+
source: $(vars.benchmark_dir)/README.md
418467

419468
- id: slurm_controller
420469
source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
Running System Benchmarks with Ramble
2+
=====================================
3+
4+
[Ramble](https://github.com/GoogleCloudPlatform/ramble) is an open source
5+
multi-platform experimentation framework written in Python. It can be used
6+
to easily reproduce benchmark results across systems, and here
7+
we will use it to run a series of system benchmarks.
8+
9+
Currently the following benchmarks are supported:
10+
11+
* NCCL tests (all-gather, all-reduce, reduce-scatter)
12+
13+
The run scripts have all been staged into `/opt/apps/system_benchmarks`
14+
on the controller node (and available to all nodes). We recommend running
15+
them using `nohup` and redirecting stdout/stderr to a log file, as the tests can
16+
take 30-60 minutes or longer if other jobs are in the queue. The results can
17+
then be viewed by `tail`ing the log file.
18+
19+
For NCCL tests, run:
20+
21+
```bash
22+
nohup bash /opt/apps/system_benchmarks/run-nccl-tests-via-ramble.sh >& nccl-$(date -Iseconds).log &
23+
tail -f nccl-*.log
24+
```
25+
26+
For each benchmark, multiple node scales will be submitted, up to the maximum
27+
node scale of your cluster.
28+
29+
Viewing the Results
30+
-------------------
31+
32+
For NCCL, at the end of the `nccl-<timestamp>.log` file,
33+
you should see something like:
34+
35+
```bash
36+
...
37+
---- SUMMARY for >1GB Message Sizes ----
38+
workload n_nodes msg_size busbw
39+
all-gather 2 1073741824 XXX.XX
40+
all-gather 2 2147483648 XXX.XX
41+
all-gather 2 4294967296 XXX.XX
42+
all-gather 2 8589934592 XXX.XX
43+
...
44+
all-reduce 2 1073741824 XXX.XX
45+
...
46+
reduce-scatter 2 1073741824 XXX.XX
47+
...
48+
49+
-------- Benchmarking Complete -------
50+
```
51+
52+
Cleaning Up
53+
-----------
54+
55+
The Ramble workspaces will be located in directories called `nccl-tests-*`.
56+
The Ramble codebase was installed to
57+
`/opt/apps/ramble`. Removing all of these directories will remove all of the
58+
files generated during these tests.

0 commit comments

Comments
 (0)