Skip to content

Commit 243ed79

Browse files
Merge pull request #4409 from RachaelSTamakloe/hotfix-resolve-a3u-a4h-nvidia-version-mismatch
Resolve a3u/a4h slurm nvidia version mismatch error
2 parents 283dd26 + 39da67d commit 243ed79

File tree

2 files changed

+40
-4
lines changed

2 files changed

+40
-4
lines changed

examples/machine-learning/a3-ultragpu-8g/a3ultra-slurm-blueprint.yaml

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ vars:
2424
# Image settings
2525
base_image:
2626
project: ubuntu-os-accelerator-images
27-
family: ubuntu-accelerator-2204-amd64-with-nvidia-570
27+
image: ubuntu-accelerator-2204-amd64-with-nvidia-570-v20250712
2828
image_build_machine_type: n2-standard-16
2929
build_slurm_from_git_ref: 6.10.0
3030
# Cluster env settings
@@ -147,6 +147,24 @@ deployment_groups:
147147
ansible.builtin.apt:
148148
deb: "{{ cuda_repo_filename }}"
149149
state: present
150+
# The following 2 tasks work around a temporary issue with Ubuntu
151+
# packaging of NVIDIA 570 driver series for kernel 6.8.0-1032
152+
# The first task (apt-mark unhold) removes any holds on these packages before the upgrade is attempted.
153+
# Failures of the unhold task are ignored (failed_when: false) in case the packages were not held.
154+
- name: Unhold NVIDIA driver packages
155+
ansible.builtin.command:
156+
cmd: apt-mark unhold linux-modules-nvidia-570-server-open-gcp linux-modules-nvidia-570-server-open-6.8.0-1032-gcp
157+
become: true
158+
changed_when: false
159+
failed_when: false
160+
- name: Install latest NVIDIA driver metapackage and kernel module
161+
ansible.builtin.apt:
162+
name:
163+
- linux-modules-nvidia-570-server-open-gcp
164+
- linux-modules-nvidia-570-server-open-6.8.0-1032-gcp
165+
state: latest
166+
update_cache: yes
167+
become: true
150168
- name: Reduce NVIDIA repository priority
151169
ansible.builtin.copy:
152170
dest: /etc/apt/preferences.d/cuda-repository-pin-600
@@ -236,7 +254,7 @@ deployment_groups:
236254
settings:
237255
disk_size: $(vars.disk_size_gb)
238256
machine_type: $(vars.image_build_machine_type)
239-
source_image_family: $(vars.base_image.family)
257+
source_image: $(vars.base_image.image)
240258
source_image_project_id: [$(vars.base_image.project)]
241259
image_family: $(vars.instance_image.family)
242260
omit_external_ip: false

examples/machine-learning/a4-highgpu-8g/a4high-slurm-blueprint.yaml

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ vars:
2424
# Image settings
2525
base_image:
2626
project: ubuntu-os-accelerator-images
27-
family: ubuntu-accelerator-2204-amd64-with-nvidia-570
27+
image: ubuntu-accelerator-2204-amd64-with-nvidia-570-v20250712
2828
image_build_machine_type: n2-standard-16
2929
build_slurm_from_git_ref: 6.10.0
3030
# Cluster env settings
@@ -148,6 +148,24 @@ deployment_groups:
148148
ansible.builtin.apt:
149149
deb: "{{ cuda_repo_filename }}"
150150
state: present
151+
# The following 2 tasks work around a temporary issue with Ubuntu
152+
# packaging of NVIDIA 570 driver series for kernel 6.8.0-1032
153+
# The first task (apt-mark unhold) removes any holds on these packages before the upgrade is attempted.
154+
# Failures of the unhold task are ignored (failed_when: false) in case the packages were not held.
155+
- name: Unhold NVIDIA driver packages
156+
ansible.builtin.command:
157+
cmd: apt-mark unhold linux-modules-nvidia-570-server-open-gcp linux-modules-nvidia-570-server-open-6.8.0-1032-gcp
158+
become: true
159+
changed_when: false
160+
failed_when: false
161+
- name: Install latest NVIDIA driver metapackage and kernel module
162+
ansible.builtin.apt:
163+
name:
164+
- linux-modules-nvidia-570-server-open-gcp
165+
- linux-modules-nvidia-570-server-open-6.8.0-1032-gcp
166+
state: latest
167+
update_cache: yes
168+
become: true
151169
- name: Reduce NVIDIA repository priority
152170
ansible.builtin.copy:
153171
dest: /etc/apt/preferences.d/cuda-repository-pin-600
@@ -237,7 +255,7 @@ deployment_groups:
237255
settings:
238256
disk_size: $(vars.disk_size_gb)
239257
machine_type: $(vars.image_build_machine_type)
240-
source_image_family: $(vars.base_image.family)
258+
source_image: $(vars.base_image.image)
241259
source_image_project_id: [$(vars.base_image.project)]
242260
image_family: $(vars.instance_image.family)
243261
omit_external_ip: false

0 commit comments

Comments
 (0)