Skip to content

Commit ed9f5b0

Browse files
authored
Merge pull request #482 from arnaldo2792/migrate-ecs-cdi
Migrate ECS to CDI
2 parents 78ba6d5 + 2374e11 commit ed9f5b0

5 files changed

+103
-1
lines changed

packages/docker-engine/daemon-nvidia-json

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,5 +25,9 @@ std = { version = "v1", helpers = ["join_array"] }
2525
{{/if}}
2626
{{/each}}
2727
{{/if}}
28-
"selinux-enabled": true
28+
"selinux-enabled": true,
29+
"features": {
30+
"cdi": true
31+
},
32+
"cdi-spec-dirs": ["/etc/cdi/"]
2933
}
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
From 018a8150cab0969c63b82a1e93047599e56de167 Mon Sep 17 00:00:00 2001
2+
From: Arnaldo Garcia Rincon <[email protected]>
3+
Date: Wed, 23 Apr 2025 01:09:31 +0000
4+
Subject: [PATCH] ldcache: fix parsing for aarch64
5+
6+
The architecture flag for aarch64 is currently missing from the
7+
supported architecture flags list. This omission causes the getEntries
8+
function to exclude all libraries found on aarch64 hosts. As a result,
9+
helper programs like nvidia-ctk are unable to generate CDI
10+
specifications for the aarch64 architecture.
11+
12+
This fix adds the missing aarch64 architecture flag, using the same
13+
value as defined in libnvidia-container[1], which maintains a more
14+
comprehensive list of supported architectures.
15+
16+
[1]: https://github.com/NVIDIA/libnvidia-container/blob/a198166e1c1166f4847598438115ea97dacc7a92/src/ldcache.h#L21
17+
18+
Signed-off-by: Arnaldo Garcia Rincon <[email protected]>
19+
---
20+
internal/ldcache/ldcache.go | 3 +++
21+
1 file changed, 3 insertions(+)
22+
23+
diff --git a/internal/ldcache/ldcache.go b/internal/ldcache/ldcache.go
24+
index 4daf95b..455048c 100644
25+
--- a/internal/ldcache/ldcache.go
26+
+++ b/internal/ldcache/ldcache.go
27+
@@ -47,6 +47,7 @@ const (
28+
flagArchX8664 = 0x0300
29+
flagArchX32 = 0x0800
30+
flagArchPpc64le = 0x0500
31+
+ flagArchAarch64 = 0x0a00
32+
)
33+
34+
var errInvalidCache = errors.New("invalid ld.so.cache file")
35+
@@ -195,6 +196,8 @@ func (c *ldcache) getEntries() []entry {
36+
switch e.Flags & flagArchMask {
37+
case flagArchX8664:
38+
fallthrough
39+
+ case flagArchAarch64:
40+
+ fallthrough
41+
case flagArchPpc64le:
42+
bits = 64
43+
case flagArchX32:
44+
--
45+
2.49.0
46+
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
[Unit]
2+
Description=Generate CDI specifications
3+
# This has to be executed after the kernel modules are loaded;
4+
# otherwise the userspace component of the driver will fail to
5+
# query the /dev devices
6+
After=load-tesla-kernel-modules.service load-open-gpu-kernel-modules.service
7+
# Running this unit after nvidia-persistenced ensures that
8+
# the /dev devices are created and the hardware is set to
9+
# persistence mode.
10+
Requires=nvidia-persistenced.service
11+
After=nvidia-persistenced.service
12+
# Block manual interactions with this service. It doesn't
13+
# make sense to regenerate CDI specifications as features
14+
# that might change the GPU (e.g. MIG) are not supported in
15+
# ECS
16+
RefuseManualStart=true
17+
RefuseManualStop=true
18+
19+
[Service]
20+
Type=oneshot
21+
# Explanation of the options:
22+
# --format json: to be consistent across Bottlerocket's variants
23+
# --mode nvml: the default mode ("auto") already resolves to this; make it explicit
24+
# --device-name-strategy uuid: the ECS agent only supports device UUIDs
25+
# --output /etc/cdi/nvidia.json: store the CDI specifications at this location
26+
ExecStart=/usr/bin/nvidia-ctk cdi generate --format json \
27+
--mode nvml \
28+
--device-name-strategy uuid \
29+
--output /etc/cdi/nvidia.json
30+
RemainAfterExit=true
31+
StandardError=journal+console
32+
33+
[Install]
34+
RequiredBy=preconfigured.target

packages/nvidia-container-toolkit/nvidia-container-toolkit-config-ecs.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,6 @@ root = "/"
33
path = "/usr/bin/nvidia-container-cli"
44
environment = []
55
ldconfig = "@/sbin/ldconfig"
6+
7+
[nvidia-container-runtime]
8+
mode = "cdi"

packages/nvidia-container-toolkit/nvidia-container-toolkit.spec

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ Source3: nvidia-gpu-devices.rules
2222
Source4: nvidia-container-toolkit-tmpfiles-ecs.conf
2323
Source5: nvidia-container-toolkit-tmpfiles-k8s.conf
2424
Source6: nvidia-container-toolkit-config-k8s
25+
Source7: generate-cdi-specs.service
26+
Patch0001: 0001-ldcache-fix-parsing-for-aarch64.patch
2527

2628
BuildRequires: %{_cross_os}glibc-devel
2729
Requires: %{_cross_os}libnvidia-container
@@ -34,6 +36,7 @@ Requires: (%{name}-k8s if %{_cross_os}variant-family(aws-k8s))
3436
%package ecs
3537
Summary: Files specific for the ECS variants
3638
Requires: %{name}
39+
Requires: %{name}-cdi-specs
3740
Conflicts: %{name}-k8s
3841

3942
%description ecs
@@ -47,6 +50,13 @@ Conflicts: %{name}-ecs
4750
%description k8s
4851
%{summary}.
4952

53+
%package cdi-specs
54+
Summary: Tools to generate CDI specifications
55+
Requires: %{name}
56+
57+
%description cdi-specs
58+
%{summary}.
59+
5060
%prep
5161
%autosetup -n %{gorepo}-%{gover} -p1
5262
%cross_go_setup %{gorepo}-%{gover} %{goproject} %{goimport}
@@ -69,6 +79,7 @@ install -d %{buildroot}%{_cross_bindir}
6979
install -d %{buildroot}%{_cross_tmpfilesdir}
7080
install -d %{buildroot}%{_cross_templatedir}
7181
install -d %{buildroot}%{_cross_udevrulesdir}
82+
install -d %{buildroot}%{_cross_unitdir}
7283
install -d %{buildroot}%{_cross_datadir}/nvidia-container-toolkit
7384
install -d %{buildroot}%{_cross_factorydir}/nvidia-container-runtime
7485
install -d %{buildroot}%{_cross_templatedir}/nvidia-container-runtime
@@ -82,6 +93,7 @@ install -p -m 0644 %{S:3} %{buildroot}%{_cross_udevrulesdir}/90-nvidia-gpu-devic
8293
install -m 0644 %{S:4} %{buildroot}%{_cross_tmpfilesdir}/nvidia-container-toolkit-ecs.conf
8394
install -m 0644 %{S:5} %{buildroot}%{_cross_tmpfilesdir}/nvidia-container-toolkit-k8s.conf
8495
install -m 0644 %{S:6} %{buildroot}%{_cross_templatedir}/nvidia-container-runtime/
96+
install -m 0644 %{S:7} %{buildroot}%{_cross_unitdir}/
8597

8698
%files
8799
%license LICENSE
@@ -100,3 +112,6 @@ install -m 0644 %{S:6} %{buildroot}%{_cross_templatedir}/nvidia-container-runtim
100112
%{_cross_factorydir}/nvidia-container-runtime/nvidia-container-toolkit-config-k8s.toml
101113
%{_cross_templatedir}/nvidia-container-runtime/nvidia-container-toolkit-config-k8s
102114
%{_cross_tmpfilesdir}/nvidia-container-toolkit-k8s.conf
115+
116+
%files cdi-specs
117+
%{_cross_unitdir}/generate-cdi-specs.service

0 commit comments

Comments
 (0)