Skip to content

Commit 3289f0b

Browse files
committed
Hold all NVIDIA software to the same version
Without this, during any combination of "update & upgrade", parts of the NVIDIA software stack are liable to be upgraded and become out of sync. While only libnvidia-compute-570-server causes immediate errors, it is best to keep everything in sync with the image until a point where an upgrade across all instances can be done.
1 parent 38b433a commit 3289f0b

File tree

5 files changed

+170
-59
lines changed

5 files changed

+170
-59
lines changed

examples/machine-learning/a3-megagpu-8g/a3mega-slurm-blueprint.yaml

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,36 @@ deployment_groups:
8484
Package: nvidia-container-toolkit nvidia-container-toolkit-base libnvidia-container-tools libnvidia-container1
8585
Pin: version 1.17.7-1
8686
Pin-Priority: 100
87+
88+
# The following holds NVIDIA software that was already installed on the
89+
# accelerator base image to be the same driver version. This reduces the
90+
# risk of a driver version mismatch.
91+
# Additional packages are held by:
92+
# https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/ansible/group_vars/os_ubuntu.yml
93+
- type: ansible-local
94+
destination: hold-nvidia-packages.yml
95+
content: |
96+
---
97+
- name: Hold nvidia packages
98+
hosts: all
99+
become: true
100+
vars:
101+
nvidia_packages_to_hold:
102+
- libnvidia-cfg1-*-server
103+
- libnvidia-compute-*-server
104+
- libnvidia-nscq-*
105+
- nvidia-compute-utils-*-server
106+
- nvidia-fabricmanager-*
107+
- nvidia-utils-*-server
108+
tasks:
109+
- name: Hold nvidia packages
110+
ansible.builtin.command:
111+
argv:
112+
- apt-mark
113+
- hold
114+
- "{{ item }}"
115+
loop: "{{ nvidia_packages_to_hold }}"
116+
87117
# it is important that kernel upgrades do not occur before running the
88118
# solution for building Slurm (which doesn't handle them well on the fly)
89119
# if you follow this rule, any module which supports DKMS will be
@@ -181,7 +211,7 @@ deployment_groups:
181211
destination: configure_gpu_monitoring.yml
182212
content: |
183213
---
184-
- name: Install NVIDIA DCGM and Configure Ops Agent
214+
- name: Install CUDA & DCGM & Configure Ops Agent
185215
hosts: all
186216
become: true
187217
vars:
@@ -193,9 +223,6 @@ deployment_groups:
193223
nvidia_packages:
194224
- cuda-toolkit-12-8
195225
- datacenter-gpu-manager-4-cuda12
196-
- libnvidia-cfg1-570-server
197-
- libnvidia-nscq-570
198-
- nvidia-compute-utils-570-server
199226
tasks:
200227
- name: Download NVIDIA repository package
201228
ansible.builtin.get_url:
@@ -223,16 +250,11 @@ deployment_groups:
223250
Package: *
224251
Pin: release l=NVIDIA CUDA
225252
Pin-Priority: 400
226-
- name: Install NVIDIA fabric and CUDA
253+
- name: Install CUDA & DCGM
227254
ansible.builtin.apt:
228255
name: "{{ item }}"
229256
update_cache: true
230257
loop: "{{ nvidia_packages }}"
231-
- name: Freeze NVIDIA fabric and CUDA
232-
ansible.builtin.dpkg_selections:
233-
name: "{{ item }}"
234-
selection: hold
235-
loop: "{{ nvidia_packages }}"
236258
- name: Create nvidia-persistenced override directory
237259
ansible.builtin.file:
238260
path: /etc/systemd/system/nvidia-persistenced.service.d
@@ -271,6 +293,7 @@ deployment_groups:
271293
name: nvidia-persistenced.service
272294
state: stopped
273295
enabled: false
296+
274297
- type: ansible-local
275298
destination: install_dmabuf.yml
276299
content: |

examples/machine-learning/a3-ultragpu-8g/a3ultra-slurm-blueprint.yaml

Lines changed: 35 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,36 @@ deployment_groups:
6969
Package: nvidia-container-toolkit nvidia-container-toolkit-base libnvidia-container-tools libnvidia-container1
7070
Pin: version 1.17.7-1
7171
Pin-Priority: 100
72+
73+
# The following holds NVIDIA software that was already installed on the
74+
# accelerator base image to be the same driver version. This reduces the
75+
# risk of a driver version mismatch.
76+
# Additional packages are held by:
77+
# https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/ansible/group_vars/os_ubuntu.yml
78+
- type: ansible-local
79+
destination: hold-nvidia-packages.yml
80+
content: |
81+
---
82+
- name: Hold nvidia packages
83+
hosts: all
84+
become: true
85+
vars:
86+
nvidia_packages_to_hold:
87+
- libnvidia-cfg1-*-server
88+
- libnvidia-compute-*-server
89+
- libnvidia-nscq-*
90+
- nvidia-compute-utils-*-server
91+
- nvidia-fabricmanager-*
92+
- nvidia-utils-*-server
93+
tasks:
94+
- name: Hold nvidia packages
95+
ansible.builtin.command:
96+
argv:
97+
- apt-mark
98+
- hold
99+
- "{{ item }}"
100+
loop: "{{ nvidia_packages_to_hold }}"
101+
72102
- type: data
73103
destination: /var/tmp/slurm_vars.json
74104
content: |
@@ -93,7 +123,7 @@ deployment_groups:
93123
-i localhost, --limit localhost --connection=local \
94124
-e @/var/tmp/slurm_vars.json \
95125
ansible/playbook.yml
96-
# this duplicates the ulimits configuration of the HPC VM Image
126+
# this duplicates the ulimits configuration of the HPC VM Image
97127
- type: data
98128
destination: /etc/security/limits.d/99-unlimited.conf
99129
content: |
@@ -103,11 +133,12 @@ deployment_groups:
103133
* - nofile 1048576
104134
* - cpu unlimited
105135
* - rtprio unlimited
136+
106137
- type: ansible-local
107-
destination: configure_gpu.yml
138+
destination: install_cuda_dcgm.yml
108139
content: |
109140
---
110-
- name: Install NVIDIA packages
141+
- name: Install CUDA & DCGM
111142
hosts: all
112143
become: true
113144
vars:
@@ -118,9 +149,6 @@ deployment_groups:
118149
nvidia_packages:
119150
- cuda-toolkit-12-8
120151
- datacenter-gpu-manager-4-cuda12
121-
- libnvidia-cfg1-570-server
122-
- libnvidia-nscq-570
123-
- nvidia-compute-utils-570-server
124152
tasks:
125153
- name: Download NVIDIA repository package
126154
ansible.builtin.get_url:
@@ -130,7 +158,6 @@ deployment_groups:
130158
ansible.builtin.apt:
131159
deb: "{{ cuda_repo_filename }}"
132160
state: present
133-
134161
- name: Reduce NVIDIA repository priority
135162
ansible.builtin.copy:
136163
dest: /etc/apt/preferences.d/cuda-repository-pin-600
@@ -149,16 +176,11 @@ deployment_groups:
149176
Package: *
150177
Pin: release l=NVIDIA CUDA
151178
Pin-Priority: 400
152-
- name: Install NVIDIA fabric and CUDA
179+
- name: Install CUDA & DCGM
153180
ansible.builtin.apt:
154181
name: "{{ item }}"
155182
update_cache: true
156183
loop: "{{ nvidia_packages }}"
157-
- name: Freeze NVIDIA fabric and CUDA
158-
ansible.builtin.dpkg_selections:
159-
name: "{{ item }}"
160-
selection: hold
161-
loop: "{{ nvidia_packages }}"
162184
- name: Create nvidia-persistenced override directory
163185
ansible.builtin.file:
164186
path: /etc/systemd/system/nvidia-persistenced.service.d

examples/machine-learning/a4-highgpu-8g/a4high-slurm-blueprint.yaml

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,36 @@ deployment_groups:
7070
Package: nvidia-container-toolkit nvidia-container-toolkit-base libnvidia-container-tools libnvidia-container1
7171
Pin: version 1.17.7-1
7272
Pin-Priority: 100
73+
74+
# The following holds NVIDIA software that was already installed on the
75+
# accelerator base image to be the same driver version. This reduces the
76+
# risk of a driver version mismatch.
77+
# Additional packages are held by:
78+
# https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/ansible/group_vars/os_ubuntu.yml
79+
- type: ansible-local
80+
destination: hold-nvidia-packages.yml
81+
content: |
82+
---
83+
- name: Hold nvidia packages
84+
hosts: all
85+
become: true
86+
vars:
87+
nvidia_packages_to_hold:
88+
- libnvidia-cfg1-*-server
89+
- libnvidia-compute-*-server
90+
- libnvidia-nscq-*
91+
- nvidia-compute-utils-*-server
92+
- nvidia-fabricmanager-*
93+
- nvidia-utils-*-server
94+
tasks:
95+
- name: Hold nvidia packages
96+
ansible.builtin.command:
97+
argv:
98+
- apt-mark
99+
- hold
100+
- "{{ item }}"
101+
loop: "{{ nvidia_packages_to_hold }}"
102+
73103
- type: data
74104
destination: /var/tmp/slurm_vars.json
75105
content: |
@@ -104,11 +134,12 @@ deployment_groups:
104134
* - nofile 1048576
105135
* - cpu unlimited
106136
* - rtprio unlimited
137+
107138
- type: ansible-local
108-
destination: configure_gpu.yml
139+
destination: install_cuda_dcgm.yml
109140
content: |
110141
---
111-
- name: Install NVIDIA packages
142+
- name: Install CUDA & DCGM
112143
hosts: all
113144
become: true
114145
vars:
@@ -119,9 +150,6 @@ deployment_groups:
119150
nvidia_packages:
120151
- cuda-toolkit-12-8
121152
- datacenter-gpu-manager-4-cuda12
122-
- libnvidia-cfg1-570-server
123-
- libnvidia-nscq-570
124-
- nvidia-compute-utils-570-server
125153
tasks:
126154
- name: Download NVIDIA repository package
127155
ansible.builtin.get_url:
@@ -131,7 +159,6 @@ deployment_groups:
131159
ansible.builtin.apt:
132160
deb: "{{ cuda_repo_filename }}"
133161
state: present
134-
135162
- name: Reduce NVIDIA repository priority
136163
ansible.builtin.copy:
137164
dest: /etc/apt/preferences.d/cuda-repository-pin-600
@@ -150,16 +177,11 @@ deployment_groups:
150177
Package: *
151178
Pin: release l=NVIDIA CUDA
152179
Pin-Priority: 400
153-
- name: Install NVIDIA fabric and CUDA
180+
- name: Install CUDA & DCGM
154181
ansible.builtin.apt:
155182
name: "{{ item }}"
156183
update_cache: true
157184
loop: "{{ nvidia_packages }}"
158-
- name: Freeze NVIDIA fabric and CUDA
159-
ansible.builtin.dpkg_selections:
160-
name: "{{ item }}"
161-
selection: hold
162-
loop: "{{ nvidia_packages }}"
163185
- name: Create nvidia-persistenced override directory
164186
ansible.builtin.file:
165187
path: /etc/systemd/system/nvidia-persistenced.service.d
@@ -225,7 +247,6 @@ deployment_groups:
225247
source_image_project_id: [$(vars.base_image.project)]
226248
image_family: $(vars.instance_image.family)
227249
omit_external_ip: false
228-
229250
# Unattended upgrades are disabled in this blueprint so that software does not
230251
# get updated daily and lead to potential instability in the cluster environment.
231252
#

examples/machine-learning/build-service-images/a3m/blueprint.yaml

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,36 @@ deployment_groups:
3737
Package: nvidia-container-toolkit nvidia-container-toolkit-base libnvidia-container-tools libnvidia-container1
3838
Pin: version 1.17.7-1
3939
Pin-Priority: 100
40+
41+
# The following holds NVIDIA software that was already installed on the
42+
# accelerator base image to be the same driver version. This reduces the
43+
# risk of a driver version mismatch.
44+
# Additional packages are held by:
45+
# https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/ansible/group_vars/os_ubuntu.yml
46+
- type: ansible-local
47+
destination: hold-nvidia-packages.yml
48+
content: |
49+
---
50+
- name: Hold nvidia packages
51+
hosts: all
52+
become: true
53+
vars:
54+
nvidia_packages_to_hold:
55+
- libnvidia-cfg1-*-server
56+
- libnvidia-compute-*-server
57+
- libnvidia-nscq-*
58+
- nvidia-compute-utils-*-server
59+
- nvidia-fabricmanager-*
60+
- nvidia-utils-*-server
61+
tasks:
62+
- name: Hold nvidia packages
63+
ansible.builtin.command:
64+
argv:
65+
- apt-mark
66+
- hold
67+
- "{{ item }}"
68+
loop: "{{ nvidia_packages_to_hold }}"
69+
4070
- type: shell
4171
destination: hold_google_services.sh
4272
content: |
@@ -118,7 +148,7 @@ deployment_groups:
118148
destination: configure_gpu_monitoring.yml
119149
content: |
120150
---
121-
- name: Install NVIDIA DCGM and Configure Ops Agent
151+
- name: Install CUDA & DCGM & Configure Ops Agent
122152
hosts: all
123153
become: true
124154
vars:
@@ -130,9 +160,6 @@ deployment_groups:
130160
nvidia_packages:
131161
- cuda-toolkit-12-8
132162
- datacenter-gpu-manager-4-cuda12
133-
- libnvidia-cfg1-570-server
134-
- libnvidia-nscq-570
135-
- nvidia-compute-utils-570-server
136163
tasks:
137164
- name: Download NVIDIA repository package
138165
ansible.builtin.get_url:
@@ -160,16 +187,11 @@ deployment_groups:
160187
Package: *
161188
Pin: release l=NVIDIA CUDA
162189
Pin-Priority: 400
163-
- name: Install NVIDIA fabric and CUDA
190+
- name: Install CUDA & DCGM
164191
ansible.builtin.apt:
165192
name: "{{ item }}"
166193
update_cache: true
167194
loop: "{{ nvidia_packages }}"
168-
- name: Freeze NVIDIA fabric and CUDA
169-
ansible.builtin.dpkg_selections:
170-
name: "{{ item }}"
171-
selection: hold
172-
loop: "{{ nvidia_packages }}"
173195
- name: Create nvidia-persistenced override directory
174196
ansible.builtin.file:
175197
path: /etc/systemd/system/nvidia-persistenced.service.d
@@ -208,6 +230,7 @@ deployment_groups:
208230
name: nvidia-persistenced.service
209231
state: stopped
210232
enabled: false
233+
211234
- type: ansible-local
212235
destination: install_dmabuf.yml
213236
content: |
@@ -300,7 +323,7 @@ deployment_groups:
300323
disk_size: 100
301324
machine_type: c2-standard-8
302325

303-
source_image: ubuntu-accelerator-2204-amd64-with-nvidia-570-v20250712
326+
source_image: ubuntu-accelerator-2204-amd64-with-nvidia-570-v20250722
304327
source_image_project_id: [ubuntu-os-accelerator-images]
305328

306329
image_family: $(vars.family)

0 commit comments

Comments (0)