Skip to content

Commit a1e3f78

Browse files
committed
nvidia-k8s-device-plugin: add two patches
0001: generate missing CDI specifications. The Device Plugin doesn't generate CDI specifications for the "all" device, which is what would be used when the env variable NVIDIA_VISIBLE_DEVICES is set to "all". This breaks compatibility with containers that rely on bypassing Kubernetes directives to have access to all the GPUs (e.g. DCGM Exporter). Follow what nvidia-ctk[1] does to generate the CDI specifications for the "all" device. [1]: https://github.com/NVIDIA/nvidia-container-toolkit/blob/6394e9e9e7f8692438f2c921c0bbc95b72d693c6/cmd/nvidia-ctk/cdi/generate/generate.go#L300-L303 0002: fix ldcache parsing for aarch64. k8s-device-plugin carries its own nvidia-container-toolkit and uses nvidia-ctk to generate the CDI specifications. The architecture flag for aarch64 is currently missing from the supported architecture flags list. This omission causes the getEntries function to exclude all libraries found on aarch64 hosts. As a result, helper programs like nvidia-ctk are unable to generate CDI specifications for the aarch64 architecture. This fix adds the missing aarch64 architecture flag, using the same value as defined in libnvidia-container[1], which maintains a more comprehensive list of supported architectures. [1]: https://github.com/NVIDIA/libnvidia-container/blob/a198166e1c1166f4847598438115ea97dacc7a92/src/ldcache.h#L21 Signed-off-by: Jingwei Wang <[email protected]>
1 parent 6f014eb commit a1e3f78

File tree

3 files changed

+96
-0
lines changed

3 files changed

+96
-0
lines changed
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
From 21bc8cf75a8f868d0f2d806652394f3aeb8f9f7d Mon Sep 17 00:00:00 2001
2+
From: Jingwei Wang <[email protected]>
3+
Date: Tue, 13 May 2025 21:21:51 +0000
4+
Subject: [PATCH] Add CDI specs for the "all" device
5+
6+
The Device Plugin does not generate the CDI specifications needed
7+
when NVIDIA_VISIBLE_DEVICES is set to "all". This limitation breaks compatibility
8+
with containers that depend on this setting, e.g. DCGM Exporter.
9+
10+
Fix this by mimicking what nvidia-ctk does[1], and add the "all" device
11+
to the CDI specifications.
12+
13+
[1]: https://github.com/NVIDIA/nvidia-container-toolkit/blob/6394e9e9e7f8692438f2c921c0bbc95b72d693c6/cmd/nvidia-ctk/cdi/generate/generate.go#L300-L303
14+
15+
Signed-off-by: Jingwei Wang <[email protected]>
16+
---
17+
internal/cdi/cdi.go | 5 +++++
18+
1 file changed, 5 insertions(+)
19+
20+
diff --git a/internal/cdi/cdi.go b/internal/cdi/cdi.go
21+
index 8813768..09c4ccf 100644
22+
--- a/internal/cdi/cdi.go
23+
+++ b/internal/cdi/cdi.go
24+
@@ -38,6 +38,7 @@ import (
25+
26+
const (
27+
cdiRoot = "/var/run/cdi"
28+
+ allDeviceName = "all"
29+
)
30+
31+
// cdiHandler creates CDI specs for devices assocatied with the device plugin
32+
@@ -124,6 +125,10 @@ func New(infolib info.Interface, nvmllib nvml.Interface, devicelib device.Interf
33+
nvcdi.WithDeviceNamers(deviceNamer),
34+
nvcdi.WithVendor(c.vendor),
35+
nvcdi.WithClass("gpu"),
36+
+ nvcdi.WithMergedDeviceOptions(
37+
+ transform.WithName(allDeviceName),
38+
+ transform.WithSkipIfExists(true),
39+
+ ),
40+
// TODO: This should be removed once the use of a NVIDIA Container Toolkit >= v1.17.5 is commonplace.
41+
nvcdi.WithDisabledHook(nvcdi.HookEnableCudaCompat),
42+
)
43+
--
44+
2.47.0
45+
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
From be4ba83b821eea9050eefdb7e67df2d757c3795a Mon Sep 17 00:00:00 2001
2+
From: Jingwei Wang <[email protected]>
3+
Date: Wed, 23 Apr 2025 17:17:35 +0000
4+
Subject: [PATCH] fix ldcache parsing for aarch64
5+
6+
k8s-device-plugin carries its own nvidia-container-toolkit and uses
7+
nvidia-ctk to generate the CDI specifications.
8+
9+
The architecture flag for aarch64 is currently missing from the
10+
supported architecture flags list. This omission causes the getEntries
11+
function to exclude all libraries found on aarch64 hosts. As a result,
12+
helper programs like nvidia-ctk are unable to generate CDI
13+
specifications for the aarch64 architecture.
14+
15+
This fix adds the missing aarch64 architecture flag, using the same
16+
value as defined in libnvidia-container[1], which maintains a more
17+
comprehensive list of supported architectures.
18+
19+
[1]: https://github.com/NVIDIA/libnvidia-container/blob/a198166e1c1166f4847598438115ea97dacc7a92/src/ldcache.h#L21
20+
21+
Signed-off-by: Jingwei Wang <[email protected]>
22+
---
23+
.../nvidia-container-toolkit/internal/ldcache/ldcache.go | 3 +++
24+
1 file changed, 3 insertions(+)
25+
26+
diff --git a/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/ldcache/ldcache.go b/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/ldcache/ldcache.go
27+
index 4daf95b..455048c 100644
28+
--- a/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/ldcache/ldcache.go
29+
+++ b/vendor/github.com/NVIDIA/nvidia-container-toolkit/internal/ldcache/ldcache.go
30+
@@ -47,6 +47,7 @@ const (
31+
flagArchX8664 = 0x0300
32+
flagArchX32 = 0x0800
33+
flagArchPpc64le = 0x0500
34+
+ flagArchAarch64 = 0x0a00
35+
)
36+
37+
var errInvalidCache = errors.New("invalid ld.so.cache file")
38+
@@ -195,6 +196,8 @@ func (c *ldcache) getEntries() []entry {
39+
switch e.Flags & flagArchMask {
40+
case flagArchX8664:
41+
fallthrough
42+
+ case flagArchAarch64:
43+
+ fallthrough
44+
case flagArchPpc64le:
45+
bits = 64
46+
case flagArchX32:
47+
--
48+
2.47.0
49+

packages/nvidia-k8s-device-plugin/nvidia-k8s-device-plugin.spec

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ Source2: nvidia-k8s-device-plugin-conf
1818
Source3: nvidia-k8s-device-plugin-exec-start-conf
1919
Source4: nvidia-k8s-device-plugin-mig-conf
2020

21+
Patch0001: 0001-Add-CDI-specs-for-the-all-device.patch
22+
Patch0002: 0002-fix-ldcache-parsing-for-aarch64.patch
2123

2224
BuildRequires: %{_cross_os}glibc-devel
2325
Requires: %{name}(binaries)

0 commit comments

Comments
 (0)