@@ -113,9 +113,7 @@ deployment_groups:
113113 - nvidia-container-toolkit
114114 - cuda-toolkit-12-8
115115 - datacenter-gpu-manager
116- - libnvidia-cfg1-570-server
117116 - libnvidia-nscq-570
118- - nvidia-compute-utils-570-server
119117 tasks:
120118 - name: Download NVIDIA repository package
121119 ansible.builtin.get_url:
@@ -135,39 +133,12 @@ deployment_groups:
135133 name: "{{ item }}"
136134 selection: hold
137135 loop: "{{ nvidia_packages }}"
138- - name: Create nvidia-persistenced override directory
139- ansible.builtin.file:
140- path: /etc/systemd/system/nvidia-persistenced.service.d
141- state: directory
142- owner: root
143- group: root
144- mode: 0o755
145- - name: Configure nvidia-persistenced override
146- ansible.builtin.copy:
147- dest: /etc/systemd/system/nvidia-persistenced.service.d/persistence_mode.conf
148- owner: root
149- group: root
150- mode: 0o644
151- content: |
152- [Service]
153- ExecStart=
154- ExecStart=/usr/bin/nvidia-persistenced --user nvidia-persistenced --verbose
155- notify: Reload SystemD
156- handlers:
157- - name: Reload SystemD
158- ansible.builtin.systemd:
159- daemon_reload: true
160136 post_tasks:
161137 - name: Disable NVIDIA DCGM by default (enable during boot on GPU nodes)
162138 ansible.builtin.service:
163139 name: nvidia-dcgm.service
164140 state: stopped
165141 enabled: false
166- - name: Disable nvidia-persistenced SystemD unit (enable during boot on GPU nodes)
167- ansible.builtin.service:
168- name: nvidia-persistenced.service
169- state: stopped
170- enabled: false
171142
172143 - type : ansible-local
173144 destination : install_ibverbs_utils.yml
@@ -351,7 +322,6 @@ deployment_groups:
351322 vars:
352323 enable_ops_agent: true
353324 enable_nvidia_dcgm: true
354- enable_nvidia_persistenced: true
355325 tasks:
356326 - name: Update Ops Agent configuration
357327 ansible.builtin.blockinfile:
@@ -386,11 +356,6 @@ deployment_groups:
386356 name: nvidia-dcgm.service
387357 state: "{{ 'started' if enable_nvidia_dcgm else 'stopped' }}"
388358 enabled: "{{ enable_nvidia_dcgm }}"
389- - name: Enable NVIDIA Persistence Daemon
390- ansible.builtin.service:
391- name: nvidia-persistenced.service
392- state: "{{ 'started' if enable_nvidia_persistenced else 'stopped' }}"
393- enabled: "{{ enable_nvidia_persistenced }}"
394359
395360 - id : a4high_nodeset
396361 source : community/modules/compute/schedmd-slurm-gcp-v6-nodeset
@@ -433,7 +398,7 @@ deployment_groups:
433398 is_default : true
434399 partition_conf :
435400 OverSubscribe : EXCLUSIVE
436- ResumeTimeout : 900
401+ ResumeTimeout : 1200
437402 SuspendTimeout : 600
438403
439404 - id : slurm_login
0 commit comments