-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnvidia-updated.yml
355 lines (305 loc) · 13.5 KB
/
nvidia-updated.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
---
# Simple NVIDIA Driver Installation Playbook for Ubuntu/Debian
# Optimized for headless compute servers (Tdarr, Plex, Frigate)
#
# Inputs (normally passed by Semaphore as extra vars):
#   target_hosts    - host pattern to run against (default: all)
#   var_reboot      - "true" to allow a reboot after a driver install (default: "false")
#   var_environment - "production" enables driver installation and configuration
#                     tasks; the default is "test"
- name: NVIDIA Driver Installation
  hosts: "{{ target_hosts | default('all') }}"
  become: true
  gather_facts: true
  vars:
    # Configuration variables
    nvidia_driver_version: latest  # Can be set to specific version like "525"
    restart_after_install: "{{ var_reboot | default('false') }}"  # Get reboot preference from Semaphore
  pre_tasks:
    # A play var that references itself (var_environment: "{{ var_environment | ... }}")
    # is templated lazily and fails with "recursive loop detected in template string"
    # whenever the variable is not supplied from outside. set_fact evaluates the
    # expression once, so the default can be applied under the same variable name.
    - name: Default var_environment to test when it is not supplied
      set_fact:
        var_environment: "{{ var_environment | default('test') }}"  # Get environment from Semaphore
  tasks:
# GPU detection: grep the lspci listing for "nvidia". Any non-zero exit is
# tolerated (failed_when: false) and is read below as "no GPU present".
- name: Check if NVIDIA GPU is present
  shell: lspci | grep -i nvidia
  register: nvidia_check
  failed_when: false
  changed_when: false

# Boolean fact that the rest of the play uses to gate on GPU presence.
- name: Set NVIDIA present fact
  set_fact:
    nvidia_present: "{{ nvidia_check.rc == 0 }}"

- name: Display NVIDIA GPU detection result
  debug:
    msg: "NVIDIA GPU {{ 'detected' if nvidia_present else 'not detected' }}"
# Banner only: reports which mode var_environment selects. Nothing is changed here.
- name: Display the current environment mode
  debug:
    msg: >
      {% if var_environment == "production" %}
      "Running in PRODUCTION mode: Will install the driver if newer."
      {% else %}
      "Running in TEST mode: Will only print driver versions."
      {% endif %}
# Stop processing only the hosts that have no NVIDIA GPU. `end_play` would end
# the play for every targeted host as soon as one host matched the condition,
# which skips GPU hosts whenever the run also includes a host without a GPU.
- name: Exit playbook if no NVIDIA GPU detected
  meta: end_host
  when: not nvidia_present
# Check current driver status
# The driver counts as installed only when nvidia-smi exists AND runs successfully.
- name: Check if NVIDIA driver is already installed
  shell: "command -v nvidia-smi && nvidia-smi"
  register: nvidia_smi_result
  failed_when: false
  changed_when: false

- name: Set driver installed fact
  set_fact:
    driver_installed: "{{ nvidia_smi_result.rc == 0 }}"

# Reads the driver version reported for the first GPU line only (head -n 1).
- name: Get current driver version if installed
  shell: "nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1"
  register: current_driver_version
  failed_when: false
  changed_when: false
  when: driver_installed

- name: Display current driver information
  debug:
    msg: "NVIDIA driver {{ current_driver_version.stdout | default('not installed') }}"
  when: driver_installed
# Driver discovery (read-only): list what ubuntu-drivers offers and choose the
# package to install. Every probe tolerates failure so the run can still report.
- name: Find available NVIDIA drivers
  shell: ubuntu-drivers devices | grep -i nvidia
  register: nvidia_drivers_output
  changed_when: false
  failed_when: false

- name: Display all available NVIDIA drivers
  debug:
    var: nvidia_drivers_output.stdout_lines

# head -n 1 keeps the result a single package name even when ubuntu-drivers
# prints more than one "recommended" line.
- name: Extract recommended NVIDIA driver
  shell: ubuntu-drivers devices | grep "recommended" | awk '{print $3}' | head -n 1
  register: recommended_driver
  changed_when: false
  failed_when: false

# default(..., true) also replaces an empty string; stdout is always defined here,
# so the plain default() never showed the fallback text.
- name: Display recommended driver information
  debug:
    msg: "Recommended NVIDIA driver: {{ recommended_driver.stdout | default('No recommended driver found', true) }}"

- name: Set driver package - use recommended if found
  set_fact:
    nvidia_driver_package: "{{ recommended_driver.stdout | trim }}"
  when: recommended_driver.stdout | trim != ""

# "latest" is not a real version component: nvidia-driver-latest-server does not
# exist as a package name, so resolve the highest numbered -server package that
# apt knows about instead.
- name: Resolve newest server driver package when version is latest
  shell: >-
    apt-cache search --names-only '^nvidia-driver-[0-9]+-server$'
    | awk '{print $1}' | sort -t- -k3,3n | tail -n 1
  register: newest_server_driver
  changed_when: false
  failed_when: false
  when:
    - recommended_driver.stdout | trim == ""
    - nvidia_driver_version | string == "latest"

- name: Set driver package - use server driver if no recommendation
  set_fact:
    nvidia_driver_package: >-
      {{ (newest_server_driver.stdout | default('') | trim)
         if (nvidia_driver_version | string == 'latest')
         else ('nvidia-driver-' ~ nvidia_driver_version ~ '-server') }}
  when: recommended_driver.stdout | trim == ""

# Only production runs install anything, so only they need to stop here; other
# runs continue and simply report an empty package name.
- name: Fail early when no driver package could be determined
  fail:
    msg: >-
      No recommended driver was reported and no nvidia-driver-*-server package
      was found in the apt cache. Set nvidia_driver_version to a specific
      version (for example "525") or install ubuntu-drivers-common.
  when:
    - nvidia_driver_package | length == 0
    - var_environment == "production"

- name: Display driver to be installed
  debug:
    msg: "Driver package to install: {{ nvidia_driver_package }}"
# Ubuntu/Debian driver installation
# Every task in this section changes the system, so each one is limited to
# production runs; the TEST-mode banner promises a report-only run.
- name: Install required dependencies
  apt:
    name:
      - software-properties-common
      - ubuntu-drivers-common
      - build-essential
      - dkms
    state: present
  when: var_environment == "production"

- name: Add NVIDIA PPA repository
  apt_repository:
    repo: ppa:graphics-drivers/ppa
    state: present
  when: var_environment == "production"

- name: Update apt cache
  apt:
    update_cache: true
  when: var_environment == "production"

# Install NVIDIA driver
# Install when no working driver is present, or when the major version embedded
# in the chosen package name (e.g. "535" in nvidia-driver-535) differs from the
# major version nvidia-smi reports (e.g. "535" in 535.129.03). Comparing the
# package name with the full version string directly can never be equal.
- name: Install NVIDIA driver
  apt:
    name: "{{ nvidia_driver_package }}"
    state: present
    update_cache: true
  register: driver_install
  when:
    - var_environment == "production"
    - >-
      not driver_installed
      or (nvidia_driver_package | regex_search('[0-9]+') | default('', true))
      != ((current_driver_version.stdout | default('')).split('.') | first)
# Display CUDA information if driver is installed
# Uses the driver_installed fact captured earlier in the play, so this reports
# the state from before any installation performed during this run.
- name: Check NVIDIA driver and CUDA version
  shell: "nvidia-smi"
  register: nvidia_smi_output
  failed_when: false
  changed_when: false
  when: driver_installed

- name: Display NVIDIA driver and CUDA information
  debug:
    msg: "{{ nvidia_smi_output.stdout_lines | default(['NVIDIA information not available']) }}"
  when: driver_installed and nvidia_smi_output.stdout is defined
# Configure for compute optimization with display
# Both configuration tasks run only when this run actually installed or updated
# the driver in production; each is followed by a task that reports why it was
# skipped otherwise.
- name: Create X11 configuration directory
  file:
    path: /etc/X11/xorg.conf.d
    state: directory
    mode: "0755"
  register: x11_config_dir
  when:
    - driver_install.changed
    - var_environment == "production"

- name: Explain X11 configuration skip reason
  debug:
    msg: "Skipping X11 configuration directory creation: {% if not driver_install.changed %}Driver not being installed or updated{% elif var_environment != 'production' %}Running in TEST mode{% else %}Unknown reason{% endif %}"
  when:
    - driver_install is defined
    - not (driver_install.changed and var_environment == "production")

# Writes an Xorg "Device" section for the nvidia driver as a drop-in file.
- name: Create GPU optimization configuration
  copy:
    dest: /etc/X11/xorg.conf.d/10-nvidia-compute.conf
    mode: "0644"
    content: |
      Section "Device"
      Identifier "NVIDIA"
      Driver "nvidia"
      Option "NoLogo" "true"
      # Not using UseDisplayDevice "None" since PiKVM is connected
      Option "RegistryDwords" "PerfLevelSrc=0x2222"
      Option "TripleBuffer" "True"
      EndSection
  when:
    - driver_install.changed
    - var_environment == "production"

- name: Explain GPU optimization configuration skip reason
  debug:
    msg: "Skipping GPU optimization configuration: {% if not driver_install.changed %}Driver not being installed or updated{% elif var_environment != 'production' %}Running in TEST mode{% else %}Unknown reason{% endif %}"
  when:
    - driver_install is defined
    - not (driver_install.changed and var_environment == "production")
# NVENC configuration for transcoding
# Adds one module option line to /etc/modprobe.d/nvidia.conf, creating the file
# if needed.
# NOTE(review): NVreg_RestrictProfilingToAdminUsers appears to govern access to
# GPU profiling counters rather than NVENC itself - confirm this is the option
# that was intended for transcoding.
- name: Ensure NVENC is enabled for transcoding
  lineinfile:
    path: /etc/modprobe.d/nvidia.conf
    line: "options nvidia NVreg_RestrictProfilingToAdminUsers=0"
    create: true
    mode: "0644"
  when:
    - driver_install.changed
    - var_environment == "production"

- name: Explain NVENC configuration skip reason
  debug:
    msg: "Skipping NVENC configuration: {% if not driver_install.changed %}Driver not being installed or updated{% elif var_environment != 'production' %}Running in TEST mode{% else %}Unknown reason{% endif %}"
  when:
    - driver_install is defined
    - not (driver_install.changed and var_environment == "production")
# Check if Docker is installed
# A missing docker binary is not an error; it is recorded in docker_installed,
# which gates the container toolkit setup.
- name: Check if Docker is installed
  command: which docker
  register: docker_check
  failed_when: false
  changed_when: false

- name: Set Docker installed fact
  set_fact:
    docker_installed: "{{ docker_check.rc == 0 }}"
# Docker GPU passthrough setup - Updated section based on Server World guide
# Runs only in production, on hosts that have Docker and a driver that was
# either already working or installed during this run.
- name: Install NVIDIA Container Toolkit
  when:
    - driver_installed or driver_install.changed
    - docker_installed
    - var_environment == "production"
  block:
    # Step 1: Install curl to download the repository
    - name: Ensure curl is installed
      apt:
        name: curl
        state: present

    # Step 2: Add NVIDIA Container Toolkit repository
    # - pipefail makes a failed download fail the task instead of leaving an
    #   empty list file behind that `creates:` would then treat as "done".
    # - The list is staged in a temp file and installed only once complete.
    # - gpg --batch --yes avoids an interactive overwrite prompt on a retry.
    # - No sudo: the play already runs with become.
    - name: Add NVIDIA Container Toolkit repository
      shell: |
        set -euo pipefail
        keyring=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
        repo_list=/etc/apt/sources.list.d/nvidia-container-toolkit.list
        staged="$(mktemp)"
        trap 'rm -f "$staged"' EXIT
        curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey |
          gpg --batch --yes --dearmor -o "$keyring"
        curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list |
          sed "s#deb https://#deb [signed-by=${keyring}] https://#g" > "$staged"
        install -m 0644 "$staged" "$repo_list"
      args:
        executable: /bin/bash
        creates: /etc/apt/sources.list.d/nvidia-container-toolkit.list

    # Step 3: Update package lists
    - name: Update apt cache after adding NVIDIA repository
      apt:
        update_cache: true

    # Step 4: Install NVIDIA Container Toolkit
    - name: Install NVIDIA Container Toolkit
      apt:
        name: nvidia-container-toolkit
        state: present

    # Step 5: Configure the Docker daemon
    # Report "changed" only when the daemon configuration really differs
    # afterwards; the previous rc == 0 rule marked every successful run as a
    # change and therefore restarted Docker on every run.
    # NOTE(review): assumes nvidia-ctk writes to its default Docker config path,
    # /etc/docker/daemon.json - confirm if a custom --config is ever used.
    - name: Configure Docker for NVIDIA GPU
      shell: |
        set -euo pipefail
        daemon_json=/etc/docker/daemon.json
        before="$(sha256sum "$daemon_json" 2>/dev/null || true)"
        nvidia-ctk runtime configure --runtime=docker
        after="$(sha256sum "$daemon_json" 2>/dev/null || true)"
        if [ "$before" != "$after" ]; then echo "DAEMON_JSON_CHANGED"; fi
      args:
        executable: /bin/bash
      register: nvidia_ctk_config
      changed_when: "'DAEMON_JSON_CHANGED' in nvidia_ctk_config.stdout"

    # Step 6: Restart Docker
    # Best effort by design: a failed restart does not abort the play.
    - name: Restart Docker service
      systemd:
        name: docker
        state: restarted
      failed_when: false
      when: nvidia_ctk_config is changed

    # Step 7: Verify the installation
    # The result is only reported; a failing test never fails the play.
    - name: Test NVIDIA runtime with Docker
      shell: docker run --rm --gpus all ubuntu nvidia-smi
      register: docker_nvidia_test
      changed_when: false
      failed_when: false

    - name: Display NVIDIA Docker test results
      debug:
        msg: "{{ docker_nvidia_test.stdout_lines | default(['NVIDIA Docker test failed']) }}"
      when: docker_nvidia_test.rc == 0

    - name: Display NVIDIA Docker test error
      debug:
        msg: "NVIDIA Docker test failed with error: {{ docker_nvidia_test.stderr | default('Unknown error') }}"
      when: docker_nvidia_test.rc != 0
# Reports which precondition of the container toolkit setup was not met.
# The condition is the negation of that block's three requirements.
- name: Explain NVIDIA Container Toolkit skip reason
  debug:
    msg: >
      Skipping NVIDIA Container Toolkit installation:
      {% if not driver_installed and not driver_install.changed %}No NVIDIA driver installed{%
      elif not docker_installed %}Docker is not installed{%
      elif var_environment != 'production' %}Running in TEST mode{%
      else %}Unknown reason{% endif %}
  when: not (driver_installed or driver_install.changed) or not docker_installed or var_environment != "production"
# Persistence mode for faster GPU response
# Each step applies only when this run installed or updated the driver in
# production, and is paired with a task that reports why it was skipped.
- name: Create nvidia-persistenced configuration directory
  file:
    path: /etc/nvidia-persistenced
    state: directory
    mode: "0755"
  when:
    - driver_install.changed
    - var_environment == "production"

- name: Explain persistence directory skip reason
  debug:
    msg: "Skipping nvidia-persistenced directory creation: {% if not driver_install.changed %}Driver not being installed or updated{% elif var_environment != 'production' %}Running in TEST mode{% else %}Unknown reason{% endif %}"
  when:
    - driver_install is defined
    - not (driver_install.changed and var_environment == "production")

# Writes a one-line settings file into the directory created above.
- name: Configure persistence mode settings
  copy:
    dest: /etc/nvidia-persistenced/configuration
    content: "persistence-mode=1"
    mode: "0644"
  when:
    - driver_install.changed
    - var_environment == "production"

- name: Explain persistence mode settings skip reason
  debug:
    msg: "Skipping persistence mode configuration: {% if not driver_install.changed %}Driver not being installed or updated{% elif var_environment != 'production' %}Running in TEST mode{% else %}Unknown reason{% endif %}"
  when:
    - driver_install is defined
    - not (driver_install.changed and var_environment == "production")

# Best effort: failed_when is false, so a unit that cannot start does not
# abort the play.
- name: Enable Nvidia Persistence Daemon
  systemd:
    name: nvidia-persistenced
    state: started
    enabled: true
  failed_when: false
  when:
    - driver_install.changed
    - var_environment == "production"

- name: Explain persistence daemon enabling skip reason
  debug:
    msg: "Skipping nvidia-persistenced daemon activation: {% if not driver_install.changed %}Driver not being installed or updated{% elif var_environment != 'production' %}Running in TEST mode{% else %}Unknown reason{% endif %}"
  when:
    - driver_install is defined
    - not (driver_install.changed and var_environment == "production")
# Final restart if needed
# restart_after_install comes from an extra var and may arrive as a real
# boolean or as "True"/"yes" as well as "true". The bool filter accepts all of
# these, whereas comparing against the literal string "true" silently skipped
# the reboot for every other spelling.
- name: Restart system if required
  reboot:
    reboot_timeout: 600
  when:
    - restart_after_install | bool
    - driver_install.changed
    - var_environment == "production"

# Mirrors the reboot condition above and reports the first unmet requirement.
- name: Explain system restart skip reason
  debug:
    msg: "Skipping system restart: {% if not (restart_after_install | bool) %}Restart not enabled in configuration{% elif not driver_install.changed %}Driver not being installed or updated{% elif var_environment != 'production' %}Running in TEST mode{% else %}Unknown reason{% endif %}"
  when:
    - driver_install is defined
    - not ((restart_after_install | bool) and driver_install.changed and var_environment == "production")