-
Notifications
You must be signed in to change notification settings - Fork 492
Expand file tree
/
Copy pathnvidia-cdi-refresh_test.go
More file actions
243 lines (202 loc) · 7.81 KB
/
nvidia-cdi-refresh_test.go
File metadata and controls
243 lines (202 loc) · 7.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package e2e
import (
"context"
"fmt"
"strings"
"time"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
const (
	// getSystemStateScript queries the overall systemd state. The suite relies
	// on the command failing while systemd is degraded and succeeding once the
	// state is "running".
	getSystemStateScript = `systemctl is-system-running 2>/dev/null`

	// setSystemdDegradedScript writes a oneshot unit whose command always
	// exits 1 and starts it, leaving a failed unit behind so that systemd
	// reports a degraded state.
	setSystemdDegradedScript = `#!/usr/bin/env bash
# Start the dummy service to force systemd to enter a degraded state
cat <<EOF > /etc/systemd/system/dummy.service
[Unit]
Description=Dummy systemd service
[Service]
Type=oneshot
ExecStart=/usr/bin/sh -c "exit 1"
EOF
# Make sure systemd picks up the unit file that was just written
systemctl daemon-reload
# We know the dummy service will fail, so we can ignore the error
systemctl start --now dummy.service 2>/dev/null || true
`

	// fixSystemDegradedScript undoes setSystemdDegradedScript: it rewrites the
	// dummy unit so that it exits 0, runs it once, and then removes the unit
	// file again.
	fixSystemDegradedScript = `#!/usr/bin/env bash
# Rewrite the dummy service so that it succeeds and no longer counts as a failed unit
cat <<EOF > /etc/systemd/system/dummy.service
[Unit]
Description=Dummy systemd service
[Service]
Type=oneshot
ExecStart=/usr/bin/sh -c "exit 0"
EOF
systemctl daemon-reload
systemctl start --now dummy.service 2>/dev/null || true
rm -rf /etc/systemd/system/dummy.service
systemctl daemon-reload
`

	// nvidiaCdiRefreshPathActiveTemplate exits 1 unless the status output of
	// nvidia-cdi-refresh.path contains "Active: active".
	nvidiaCdiRefreshPathActiveTemplate = `
if ! systemctl status nvidia-cdi-refresh.path | grep "Active: active"; then
echo "nvidia-cdi-refresh.path is not Active"
exit 1
fi
`

	// nvidiaCdiRefreshServiceLoadedTemplate exits 1 unless the status output of
	// nvidia-cdi-refresh.service contains "Loaded: loaded".
	nvidiaCdiRefreshServiceLoadedTemplate = `
if ! systemctl status nvidia-cdi-refresh.service | grep "Loaded: loaded"; then
echo "nvidia-cdi-refresh.service is not loaded"
exit 1
fi
`

	// nvidiaCdiRefreshFileExistsTemplate exits 1 if /var/run/cdi/nvidia.yaml is
	// missing or differs from a spec freshly generated into /tmp/nvidia.yaml.
	nvidiaCdiRefreshFileExistsTemplate = `
# Check that /var/run/cdi/nvidia.yaml exists and exit with 1 if it does not
if [ ! -f /var/run/cdi/nvidia.yaml ]; then
echo "nvidia.yaml file does not exist"
exit 1
fi
# generate the nvidia.yaml file
nvidia-ctk cdi generate --output=/tmp/nvidia.yaml
# diff the generated file with the one in /var/run/cdi/nvidia.yaml and exit with 0 if they are the same
if ! diff /var/run/cdi/nvidia.yaml /tmp/nvidia.yaml; then
echo "nvidia.yaml file is different"
exit 1
fi
`

	// nvidiaCdiRefreshUpgradeTemplate removes the existing CDI specs, replaces
	// the nvidia-ctk binary with a copy of itself to simulate an upgrade, waits
	// briefly, and then exits 1 if /var/run/cdi/nvidia.yaml was not recreated
	// or differs from a freshly generated spec.
	nvidiaCdiRefreshUpgradeTemplate = `
# remove the generated files
rm -f /var/run/cdi/nvidia.yaml /tmp/nvidia.yaml
# Simulate a binary upgrade by removing and recreating the file
# This mimics package manager behavior during upgrades
NVIDIA_CTK_PATH=$(which nvidia-ctk)
# Create a backup
cp "${NVIDIA_CTK_PATH}" "${NVIDIA_CTK_PATH}.backup"
# Remove the original (simulating uninstall/upgrade phase)
rm -f "${NVIDIA_CTK_PATH}"
# Small delay to ensure systemd detects the removal
sleep 0.1
# Restore from backup (simulating install phase)
cp "${NVIDIA_CTK_PATH}.backup" "${NVIDIA_CTK_PATH}"
# Ensure executable permissions
chmod +x "${NVIDIA_CTK_PATH}"
# Clean up backup
rm -f "${NVIDIA_CTK_PATH}.backup"
# wait for systemd path unit to detect the change and trigger the service
sleep 3
# Check if the file /var/run/cdi/nvidia.yaml is created
if [ ! -f /var/run/cdi/nvidia.yaml ]; then
echo "nvidia.yaml file is not created after upgrading the nvidia-ctk binary"
exit 1
fi
# generate the nvidia.yaml file
nvidia-ctk cdi generate --output=/tmp/nvidia.yaml
# diff the generated file with the one in /var/run/cdi/nvidia.yaml and exit with 0 if they are the same
if ! diff /var/run/cdi/nvidia.yaml /tmp/nvidia.yaml; then
echo "nvidia.yaml file is different"
exit 1
fi
`
)
// This suite exercises the nvidia-cdi-refresh systemd units inside a nested
// container created from outerContainerImage. It runs the same checks twice:
// once after a plain toolkit installation and once after the toolkit is
// installed while systemd reports a degraded state.
var _ = Describe("nvidia-cdi-refresh", Ordered, ContinueOnFailure, Label("systemd-unit"), func() {
	var (
		containerName = "nvctk-e2e-nvidia-cdi-refresh-tests"
		systemdRunner Runner
		// TODO(@ArangoGutierrez): https://github.com/NVIDIA/nvidia-container-toolkit/pull/1235/files#r2302013660
		outerContainerImage = "docker.io/kindest/base:v20250521-31a79fd4"
	)

	BeforeAll(func(ctx context.Context) {
		var err error
		// TODO: We set installCTK to true here to SKIP the mounting of the files from the host.
		// The test here does NOT require the host toolkit.
		systemdRunner, err = NewNestedContainerRunner(runner, outerContainerImage, true, containerName, localCacheDir)
		Expect(err).ToNot(HaveOccurred())

		// Poll for up to ~10 seconds until the state query succeeds.
		ready := false
		for range 10 {
			state, _, err := systemdRunner.Run(getSystemStateScript)
			if err == nil {
				GinkgoLogr.Info("systemd started", "state", state)
				ready = true
				break
			}
			GinkgoLogr.Error(err, "systemctl state")
			time.Sleep(1 * time.Second)
		}
		if !ready {
			// Keep going (best effort), but make the timeout visible in the
			// log instead of falling through silently.
			GinkgoLogr.Info("systemd did not report a running state in time; continuing anyway")
		}
	})

	AfterAll(func(ctx context.Context) {
		// Best-effort cleanup: force-remove the outer container.
		// Use || true to ensure cleanup doesn't fail the test
		runner.Run(fmt.Sprintf("docker rm -f %s 2>/dev/null || true", containerName))
	})

	When("installing nvidia-container-toolkit", Ordered, func() {
		BeforeAll(func(ctx context.Context) {
			_, _, err := toolkitInstaller.Install(systemdRunner)
			Expect(err).ToNot(HaveOccurred())

			output, _, err := systemdRunner.Run("nvidia-ctk --version")
			Expect(err).ToNot(HaveOccurred())
			GinkgoLogr.Info("using nvidia-ctk", "version", strings.TrimSpace(output))
		})

		AfterAll(func(ctx context.Context) {
			// Purge the packages so the next context starts without the toolkit.
			_, _, err := systemdRunner.Run("apt-get purge -y libnvidia-container* nvidia-container-toolkit*")
			Expect(err).ToNot(HaveOccurred())
		})

		It("should load the nvidia-cdi-refresh.path unit", func(ctx context.Context) {
			_, _, err := systemdRunner.Run(nvidiaCdiRefreshPathActiveTemplate)
			Expect(err).ToNot(HaveOccurred())
		})

		It("should load the nvidia-cdi-refresh.service unit", func(ctx context.Context) {
			_, _, err := systemdRunner.Run(nvidiaCdiRefreshServiceLoadedTemplate)
			Expect(err).ToNot(HaveOccurred())
		})

		It("should generate the nvidia.yaml file", func(ctx context.Context) {
			_, _, err := systemdRunner.Run(nvidiaCdiRefreshFileExistsTemplate)
			Expect(err).ToNot(HaveOccurred())
		})

		It("should refresh the nvidia.yaml file after upgrading the nvidia-container-toolkit", func(ctx context.Context) {
			_, _, err := systemdRunner.Run(nvidiaCdiRefreshUpgradeTemplate)
			Expect(err).ToNot(HaveOccurred())
		})
	})

	When("installing nvidia-container-toolkit on a system with a degraded systemd", Ordered, func() {
		BeforeAll(func(ctx context.Context) {
			_, _, err := systemdRunner.Run(setSystemdDegradedScript)
			Expect(err).ToNot(HaveOccurred())

			// The state query is expected to fail and report "degraded".
			_, _, err = systemdRunner.Run(getSystemStateScript)
			Expect(err).To(HaveOccurred())
			Expect(err.Error()).To(ContainSubstring("degraded"))

			// The previous context purges the toolkit in its AfterAll, so it
			// has to be installed again here, now that systemd is degraded,
			// for the specs below to test what this context describes.
			_, _, err = toolkitInstaller.Install(systemdRunner)
			Expect(err).ToNot(HaveOccurred())
		})

		AfterAll(func(ctx context.Context) {
			_, _, err := systemdRunner.Run(fixSystemDegradedScript)
			Expect(err).ToNot(HaveOccurred())

			state, _, err := systemdRunner.Run(getSystemStateScript)
			Expect(err).ToNot(HaveOccurred())
			Expect(strings.TrimSpace(state)).To(Equal("running"))
		})

		It("should load the nvidia-cdi-refresh.path unit", func(ctx context.Context) {
			_, _, err := systemdRunner.Run(nvidiaCdiRefreshPathActiveTemplate)
			Expect(err).ToNot(HaveOccurred())
		})

		It("should load the nvidia-cdi-refresh.service unit", func(ctx context.Context) {
			_, _, err := systemdRunner.Run(nvidiaCdiRefreshServiceLoadedTemplate)
			Expect(err).ToNot(HaveOccurred())
		})

		It("should generate the nvidia.yaml file", func(ctx context.Context) {
			_, _, err := systemdRunner.Run(nvidiaCdiRefreshFileExistsTemplate)
			Expect(err).ToNot(HaveOccurred())
		})

		// This spec previously duplicated the "generate" spec above; it now
		// mirrors the refresh spec of the non-degraded context.
		It("should refresh the nvidia.yaml file after upgrading the nvidia-container-toolkit", func(ctx context.Context) {
			_, _, err := systemdRunner.Run(nvidiaCdiRefreshUpgradeTemplate)
			Expect(err).ToNot(HaveOccurred())
		})
	})
})