Skip to content

Commit e83fd33

Browse files
committed
Add cuda-compat-mode config option
Signed-off-by: Evan Lezar <elezar@nvidia.com>
1 parent 2e225f1 commit e83fd33

File tree

7 files changed

+89
-41
lines changed

7 files changed

+89
-41
lines changed

cmd/nvidia-container-runtime-hook/main.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,9 +114,15 @@ func doPrestart() {
114114
}
115115
args = append(args, "configure")
116116

117-
if !hook.Features.AllowCUDACompatLibsFromContainer.IsEnabled() {
117+
switch cli.CUDACompatMode {
118+
case config.CUDACompatModeMount:
119+
args = append(args, "--cuda-compat-mode=mount")
120+
case config.CUDACompatModeLdconfig, "":
121+
args = append(args, "--cuda-compat-mode=ldconfig")
122+
default:
118123
args = append(args, "--no-cntlibs")
119124
}
125+
120126
if ldconfigPath := cli.NormalizeLDConfigPath(); ldconfigPath != "" {
121127
args = append(args, fmt.Sprintf("--ldconfig=%s", ldconfigPath))
122128
}

cmd/nvidia-ctk-installer/main_test.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v
5353
swarm-resource = ""
5454
5555
[nvidia-container-cli]
56+
cuda-compat-mode = "ldconfig"
5657
debug = ""
5758
environment = []
5859
ldcache = ""
@@ -114,6 +115,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v
114115
swarm-resource = ""
115116
116117
[nvidia-container-cli]
118+
cuda-compat-mode = "ldconfig"
117119
debug = ""
118120
environment = []
119121
ldcache = ""
@@ -178,6 +180,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v
178180
swarm-resource = ""
179181
180182
[nvidia-container-cli]
183+
cuda-compat-mode = "ldconfig"
181184
debug = ""
182185
environment = []
183186
ldcache = ""
@@ -239,6 +242,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v
239242
swarm-resource = ""
240243
241244
[nvidia-container-cli]
245+
cuda-compat-mode = "ldconfig"
242246
debug = ""
243247
environment = []
244248
ldcache = ""
@@ -322,6 +326,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v
322326
swarm-resource = ""
323327
324328
[nvidia-container-cli]
329+
cuda-compat-mode = "ldconfig"
325330
debug = ""
326331
environment = []
327332
ldcache = ""

internal/config/cli.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,13 @@ import (
2222
"strings"
2323
)
2424

25+
// Supported values for the nvidia-container-cli.cuda-compat-mode config option,
// which selects how CUDA Forward Compat libraries are made discoverable in the
// container.
const (
	// CUDACompatModeMount causes the runtime hook to pass
	// --cuda-compat-mode=mount to the nvidia-container-cli configure command.
	CUDACompatModeMount = "mount"
	// CUDACompatModeLdconfig causes the runtime hook to pass
	// --cuda-compat-mode=ldconfig to the nvidia-container-cli configure
	// command. This is the default mode and is also used when no mode is set.
	CUDACompatModeLdconfig = "ldconfig"
	// CUDACompatModeHook selects the enable-cuda-compat hook. In the legacy
	// runtime mode, this is the only value for which that hook is injected.
	CUDACompatModeHook = "hook"
	// CUDACompatModeDisabled names the mode in which CUDA Forward Compat
	// handling is switched off.
	// NOTE(review): presumably mapped to --no-cntlibs by the runtime hook;
	// confirm against the hook's mode switch.
	CUDACompatModeDisabled = "disabled"
)
31+
2532
// ContainerCLIConfig stores the options for the nvidia-container-cli
2633
type ContainerCLIConfig struct {
2734
Root string `toml:"root"`
@@ -44,6 +51,9 @@ type ContainerCLIConfig struct {
4451
// is required, the features.allow-ldconfig-from-container feature gate must
4552
// be enabled explicitly.
4653
Ldconfig ldconfigPath `toml:"ldconfig"`
54+
// CUDACompatMode sets the mode to be used to make CUDA Forward Compat
55+
// libraries discoverable in the container.
56+
CUDACompatMode string `toml:"cuda-compat-mode,omitempty"`
4757
}
4858

4959
// NormalizeLDConfigPath returns the resolved path of the configured LDConfig binary.

internal/config/config.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -100,9 +100,10 @@ func GetDefault() (*Config, error) {
100100
AcceptEnvvarUnprivileged: true,
101101
SupportedDriverCapabilities: image.SupportedDriverCapabilities.String(),
102102
NVIDIAContainerCLIConfig: ContainerCLIConfig{
103-
LoadKmods: true,
104-
Ldconfig: getLdConfigPath(),
105-
User: getUserGroup(),
103+
LoadKmods: true,
104+
Ldconfig: getLdConfigPath(),
105+
User: getUserGroup(),
106+
CUDACompatMode: CUDACompatModeLdconfig,
106107
},
107108
NVIDIACTKConfig: CTKConfig{
108109
Path: nvidiaCTKExecutable,

internal/config/config_test.go

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,10 @@ func TestGetConfig(t *testing.T) {
5656
AcceptEnvvarUnprivileged: true,
5757
SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video",
5858
NVIDIAContainerCLIConfig: ContainerCLIConfig{
59-
Root: "",
60-
LoadKmods: true,
61-
Ldconfig: "@/test/ld/config/path",
59+
Root: "",
60+
LoadKmods: true,
61+
Ldconfig: "@/test/ld/config/path",
62+
CUDACompatMode: "ldconfig",
6263
},
6364
NVIDIAContainerRuntimeConfig: RuntimeConfig{
6465
DebugFilePath: "/dev/null",
@@ -93,6 +94,7 @@ func TestGetConfig(t *testing.T) {
9394
"nvidia-container-cli.load-kmods = false",
9495
"nvidia-container-cli.ldconfig = \"@/foo/bar/ldconfig\"",
9596
"nvidia-container-cli.user = \"foo:bar\"",
97+
"nvidia-container-cli.cuda-compat-mode = \"mount\"",
9698
"nvidia-container-runtime.debug = \"/foo/bar\"",
9799
"nvidia-container-runtime.discover-mode = \"not-legacy\"",
98100
"nvidia-container-runtime.log-level = \"debug\"",
@@ -109,10 +111,11 @@ func TestGetConfig(t *testing.T) {
109111
AcceptEnvvarUnprivileged: false,
110112
SupportedDriverCapabilities: "compute,utility",
111113
NVIDIAContainerCLIConfig: ContainerCLIConfig{
112-
Root: "/bar/baz",
113-
LoadKmods: false,
114-
Ldconfig: "@/foo/bar/ldconfig",
115-
User: "foo:bar",
114+
Root: "/bar/baz",
115+
LoadKmods: false,
116+
Ldconfig: "@/foo/bar/ldconfig",
117+
User: "foo:bar",
118+
CUDACompatMode: "mount",
116119
},
117120
NVIDIAContainerRuntimeConfig: RuntimeConfig{
118121
DebugFilePath: "/foo/bar",
@@ -156,8 +159,9 @@ func TestGetConfig(t *testing.T) {
156159
AcceptEnvvarUnprivileged: true,
157160
SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video",
158161
NVIDIAContainerCLIConfig: ContainerCLIConfig{
159-
Ldconfig: "/foo/bar/ldconfig",
160-
LoadKmods: true,
162+
Ldconfig: "/foo/bar/ldconfig",
163+
LoadKmods: true,
164+
CUDACompatMode: "ldconfig",
161165
},
162166
NVIDIAContainerRuntimeConfig: RuntimeConfig{
163167
DebugFilePath: "/dev/null",
@@ -200,6 +204,7 @@ func TestGetConfig(t *testing.T) {
200204
"root = \"/bar/baz\"",
201205
"load-kmods = false",
202206
"ldconfig = \"@/foo/bar/ldconfig\"",
207+
"cuda-compat-mode = \"mount\"",
203208
"user = \"foo:bar\"",
204209
"[nvidia-container-runtime]",
205210
"debug = \"/foo/bar\"",
@@ -222,10 +227,11 @@ func TestGetConfig(t *testing.T) {
222227
AcceptEnvvarUnprivileged: false,
223228
SupportedDriverCapabilities: "compute,utility",
224229
NVIDIAContainerCLIConfig: ContainerCLIConfig{
225-
Root: "/bar/baz",
226-
LoadKmods: false,
227-
Ldconfig: "@/foo/bar/ldconfig",
228-
User: "foo:bar",
230+
Root: "/bar/baz",
231+
LoadKmods: false,
232+
Ldconfig: "@/foo/bar/ldconfig",
233+
CUDACompatMode: "mount",
234+
User: "foo:bar",
229235
},
230236
NVIDIAContainerRuntimeConfig: RuntimeConfig{
231237
DebugFilePath: "/foo/bar",
@@ -264,10 +270,11 @@ func TestGetConfig(t *testing.T) {
264270
AcceptEnvvarUnprivileged: true,
265271
SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video",
266272
NVIDIAContainerCLIConfig: ContainerCLIConfig{
267-
Root: "",
268-
LoadKmods: true,
269-
Ldconfig: "@/test/ld/config/path",
270-
User: "root:video",
273+
Root: "",
274+
LoadKmods: true,
275+
Ldconfig: "@/test/ld/config/path",
276+
CUDACompatMode: "ldconfig",
277+
User: "root:video",
271278
},
272279
NVIDIAContainerRuntimeConfig: RuntimeConfig{
273280
DebugFilePath: "/dev/null",
@@ -303,10 +310,11 @@ func TestGetConfig(t *testing.T) {
303310
AcceptEnvvarUnprivileged: true,
304311
SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video",
305312
NVIDIAContainerCLIConfig: ContainerCLIConfig{
306-
Root: "",
307-
LoadKmods: true,
308-
Ldconfig: "@/test/ld/config/path",
309-
User: "foo:bar",
313+
Root: "",
314+
LoadKmods: true,
315+
Ldconfig: "@/test/ld/config/path",
316+
CUDACompatMode: "ldconfig",
317+
User: "foo:bar",
310318
},
311319
NVIDIAContainerRuntimeConfig: RuntimeConfig{
312320
DebugFilePath: "/dev/null",

internal/config/toml_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v
4848
#swarm-resource = "DOCKER_RESOURCE_GPU"
4949
5050
[nvidia-container-cli]
51+
cuda-compat-mode = "ldconfig"
5152
#debug = "/var/log/nvidia-container-toolkit.log"
5253
environment = []
5354
#ldcache = "/etc/ld.so.cache"

internal/modifier/gated.go

Lines changed: 33 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -79,24 +79,41 @@ func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image
7979
discoverers = append(discoverers, d)
8080
}
8181

82-
if !cfg.Features.AllowCUDACompatLibsFromContainer.IsEnabled() && !cfg.Features.DisableCUDACompatLibHook.IsEnabled() {
83-
compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver)
84-
discoverers = append(discoverers, compatLibHookDiscoverer)
85-
// For legacy mode, we also need to inject a hook to update the LDCache
86-
// after we have modified the configuration.
87-
if cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy" {
88-
ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook(
89-
logger,
90-
discover.None{},
91-
cfg.NVIDIACTKConfig.Path,
92-
"",
93-
)
94-
if err != nil {
95-
return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err)
96-
}
97-
discoverers = append(discoverers, ldcacheUpdateHookDiscoverer)
82+
// If the feature flag has explicitly been toggled, we don't make any modification.
83+
if !cfg.Features.DisableCUDACompatLibHook.IsEnabled() {
84+
cudaCompatDiscoverer, err := getCudaCompatModeDiscoverer(logger, cfg, driver)
85+
if err != nil {
86+
return nil, fmt.Errorf("failed to construct CUDA Compat discoverer: %w", err)
9887
}
88+
discoverers = append(discoverers, cudaCompatDiscoverer)
9989
}
10090

10191
return NewModifierFromDiscoverer(logger, discover.Merge(discoverers...))
10292
}
93+
94+
func getCudaCompatModeDiscoverer(logger logger.Interface, cfg *config.Config, driver *root.Driver) (discover.Discover, error) {
95+
// For legacy mode, we only include the enable-cuda-compat hook if cuda-compat-mode is set to hook.
96+
if cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy" && cfg.NVIDIAContainerCLIConfig.CUDACompatMode != config.CUDACompatModeHook {
97+
return nil, nil
98+
}
99+
100+
compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver)
101+
// For non-legacy modes we return the hook as is. These modes *should* already include the update-ldcache hook.
102+
if cfg.NVIDIAContainerRuntimeConfig.Mode != "legacy" {
103+
return compatLibHookDiscoverer, nil
104+
}
105+
106+
// For legacy mode, we also need to inject a hook to update the LDCache
107+
// after we have modifed the configuration.
108+
ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook(
109+
logger,
110+
discover.None{},
111+
cfg.NVIDIACTKConfig.Path,
112+
"",
113+
)
114+
if err != nil {
115+
return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err)
116+
}
117+
118+
return discover.Merge(compatLibHookDiscoverer, ldcacheUpdateHookDiscoverer), nil
119+
}

0 commit comments

Comments
 (0)