Skip to content

Commit a575634

Browse files
harshthakkar01 authored and RachaelSTamakloe committed
Merge pull request #3729 from samskillman/fix/a4-nvidia-utils
nvidia compute server variants not yet available for 570
1 parent 855c8c0 commit a575634

File tree

1 file changed

+1
-36
lines changed

1 file changed

+1
-36
lines changed

examples/machine-learning/a4-highgpu-8g/a4high-slurm-blueprint.yaml

Lines changed: 1 addition & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -113,9 +113,7 @@ deployment_groups:
113113
- nvidia-container-toolkit
114114
- cuda-toolkit-12-8
115115
- datacenter-gpu-manager
116-
- libnvidia-cfg1-570-server
117116
- libnvidia-nscq-570
118-
- nvidia-compute-utils-570-server
119117
tasks:
120118
- name: Download NVIDIA repository package
121119
ansible.builtin.get_url:
@@ -135,39 +133,12 @@ deployment_groups:
135133
name: "{{ item }}"
136134
selection: hold
137135
loop: "{{ nvidia_packages }}"
138-
- name: Create nvidia-persistenced override directory
139-
ansible.builtin.file:
140-
path: /etc/systemd/system/nvidia-persistenced.service.d
141-
state: directory
142-
owner: root
143-
group: root
144-
mode: 0o755
145-
- name: Configure nvidia-persistenced override
146-
ansible.builtin.copy:
147-
dest: /etc/systemd/system/nvidia-persistenced.service.d/persistence_mode.conf
148-
owner: root
149-
group: root
150-
mode: 0o644
151-
content: |
152-
[Service]
153-
ExecStart=
154-
ExecStart=/usr/bin/nvidia-persistenced --user nvidia-persistenced --verbose
155-
notify: Reload SystemD
156-
handlers:
157-
- name: Reload SystemD
158-
ansible.builtin.systemd:
159-
daemon_reload: true
160136
post_tasks:
161137
- name: Disable NVIDIA DCGM by default (enable during boot on GPU nodes)
162138
ansible.builtin.service:
163139
name: nvidia-dcgm.service
164140
state: stopped
165141
enabled: false
166-
- name: Disable nvidia-persistenced SystemD unit (enable during boot on GPU nodes)
167-
ansible.builtin.service:
168-
name: nvidia-persistenced.service
169-
state: stopped
170-
enabled: false
171142
172143
- type: ansible-local
173144
destination: install_ibverbs_utils.yml
@@ -351,7 +322,6 @@ deployment_groups:
351322
vars:
352323
enable_ops_agent: true
353324
enable_nvidia_dcgm: true
354-
enable_nvidia_persistenced: true
355325
tasks:
356326
- name: Update Ops Agent configuration
357327
ansible.builtin.blockinfile:
@@ -386,11 +356,6 @@ deployment_groups:
386356
name: nvidia-dcgm.service
387357
state: "{{ 'started' if enable_nvidia_dcgm else 'stopped' }}"
388358
enabled: "{{ enable_nvidia_dcgm }}"
389-
- name: Enable NVIDIA Persistence Daemon
390-
ansible.builtin.service:
391-
name: nvidia-persistenced.service
392-
state: "{{ 'started' if enable_nvidia_persistenced else 'stopped' }}"
393-
enabled: "{{ enable_nvidia_persistenced }}"
394359
395360
- id: a4high_nodeset
396361
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
@@ -433,7 +398,7 @@ deployment_groups:
433398
is_default: true
434399
partition_conf:
435400
OverSubscribe: EXCLUSIVE
436-
ResumeTimeout: 900
401+
ResumeTimeout: 1200
437402
SuspendTimeout: 600
438403

439404
- id: slurm_login

0 commit comments

Comments
 (0)