4141 family : $(vars.deployment_name)-u22
4242 disk_size_gb : 200
4343 nccl_plugin_version : v1.0.4
44+ benchmark_dir : $(ghpc_stage("system_benchmarks"))
4445
4546deployment_groups :
4647- group : image-env
@@ -54,6 +55,7 @@ deployment_groups:
5455 install_ansible : true
5556 docker :
5657 enabled : true
58+ world_writable : true
5759 runners :
5860 - type : shell
5961 destination : prep-for-slurm-build.sh
@@ -108,10 +110,12 @@ deployment_groups:
108110 nvidia_packages:
109111 - nvidia-open-570
110112 - nvidia-utils-570
111- - cuda-toolkit-12-8
112113 - nvidia-container-toolkit
114+ - cuda-toolkit-12-8
113115 - datacenter-gpu-manager
116+ - libnvidia-cfg1-570-server
114117 - libnvidia-nscq-570
118+ - nvidia-compute-utils-570-server
115119 tasks:
116120 - name: Download NVIDIA repository package
117121 ansible.builtin.get_url:
@@ -131,12 +135,40 @@ deployment_groups:
131135 name: "{{ item }}"
132136 selection: hold
133137 loop: "{{ nvidia_packages }}"
138+ - name: Create nvidia-persistenced override directory
139+ ansible.builtin.file:
140+ path: /etc/systemd/system/nvidia-persistenced.service.d
141+ state: directory
142+ owner: root
143+ group: root
144+ mode: 0o755
145+ - name: Configure nvidia-persistenced override
146+ ansible.builtin.copy:
147+ dest: /etc/systemd/system/nvidia-persistenced.service.d/persistence_mode.conf
148+ owner: root
149+ group: root
150+ mode: 0o644
151+ content: |
152+ [Service]
153+ ExecStart=
154+ ExecStart=/usr/bin/nvidia-persistenced --user nvidia-persistenced --verbose
155+ notify: Reload SystemD
156+ handlers:
157+ - name: Reload SystemD
158+ ansible.builtin.systemd:
159+ daemon_reload: true
134160 post_tasks:
135161 - name: Disable NVIDIA DCGM by default (enable during boot on GPU nodes)
136162 ansible.builtin.service:
137163 name: nvidia-dcgm.service
138164 state: stopped
139165 enabled: false
166+ - name: Disable nvidia-persistenced SystemD unit (enable during boot on GPU nodes)
167+ ansible.builtin.service:
168+ name: nvidia-persistenced.service
169+ state: stopped
170+ enabled: false
171+
140172 - type : ansible-local
141173 destination : install_ibverbs_utils.yml
142174 content : |
@@ -150,6 +182,11 @@ deployment_groups:
150182 name:
151183 - ibverbs-utils
152184 state: present
185+ - type : data
186+ destination : /etc/enroot/enroot.conf
187+ content : |
188+ ENROOT_CONFIG_PATH ${HOME}/.enroot
189+
153190
154191 - group : image
155192 modules :
@@ -314,6 +351,7 @@ deployment_groups:
314351 vars:
315352 enable_ops_agent: true
316353 enable_nvidia_dcgm: true
354+ enable_nvidia_persistenced: true
317355 tasks:
318356 - name: Update Ops Agent configuration
319357 ansible.builtin.blockinfile:
@@ -348,6 +386,11 @@ deployment_groups:
348386 name: nvidia-dcgm.service
349387 state: "{{ 'started' if enable_nvidia_dcgm else 'stopped' }}"
350388 enabled: "{{ enable_nvidia_dcgm }}"
389+ - name: Enable NVIDIA Persistence Daemon
390+ ansible.builtin.service:
391+ name: nvidia-persistenced.service
392+ state: "{{ 'started' if enable_nvidia_persistenced else 'stopped' }}"
393+ enabled: "{{ enable_nvidia_persistenced }}"
351394
352395 - id : a4high_nodeset
353396 source : community/modules/compute/schedmd-slurm-gcp-v6-nodeset
@@ -415,6 +458,12 @@ deployment_groups:
415458 mkdir -m 0755 -p "${SLURM_ROOT}/scripts"
416459 mkdir -p "${SLURM_ROOT}/partition-${PARTITION_NAME}-epilog_slurmd.d"
417460 ln -s "/slurm/scripts/tools/gpu-test" "${SLURM_ROOT}/partition-${PARTITION_NAME}-epilog_slurmd.d/gpu-test.epilog_slurmd"
461+ - type : data
462+ destination : /opt/apps/system_benchmarks/run-nccl-tests-via-ramble.sh
463+ source : $(vars.benchmark_dir)/run-nccl-tests-via-ramble.sh
464+ - type : data
465+ destination : /opt/apps/system_benchmarks/README.md
466+ source : $(vars.benchmark_dir)/README.md
418467
419468 - id : slurm_controller
420469 source : community/modules/scheduler/schedmd-slurm-gcp-v6-controller
0 commit comments