diff --git a/cmd/nvidia-ctk/cdi/generate/generate.go b/cmd/nvidia-ctk/cdi/generate/generate.go index 8578e7259..b919598f0 100644 --- a/cmd/nvidia-ctk/cdi/generate/generate.go +++ b/cmd/nvidia-ctk/cdi/generate/generate.go @@ -70,8 +70,9 @@ type options struct { featureFlags []string csv struct { - files []string - ignorePatterns []string + files []string + ignorePatterns []string + compatContainerRoot string } noAllDevice bool @@ -212,6 +213,12 @@ func (m command) build() *cli.Command { Destination: &opts.csv.ignorePatterns, Sources: cli.EnvVars("NVIDIA_CTK_CDI_GENERATE_CSV_IGNORE_PATTERNS"), }, + &cli.StringFlag{ + Name: "csv.compat-container-root", + Usage: "specify the container folder to use for CUDA Forward Compatibility in non-standard containers", + Destination: &opts.csv.compatContainerRoot, + Sources: cli.EnvVars("NVIDIA_CTK_CDI_GENERATE_CSV_CONTAINER_COMPAT_ROOT"), + }, &cli.StringSliceFlag{ Name: "disable-hook", Aliases: []string{"disable-hooks"}, @@ -384,6 +391,7 @@ func (m command) generateSpecs(opts *options) ([]generatedSpecs, error) { nvcdi.WithLibrarySearchPaths(opts.librarySearchPaths), nvcdi.WithCSVFiles(opts.csv.files), nvcdi.WithCSVIgnorePatterns(opts.csv.ignorePatterns), + nvcdi.WithCSVCompatContainerRoot(opts.csv.compatContainerRoot), nvcdi.WithDisabledHooks(opts.disabledHooks...), nvcdi.WithEnabledHooks(opts.enabledHooks...), nvcdi.WithFeatureFlags(opts.featureFlags...), diff --git a/internal/config/runtime.go b/internal/config/runtime.go index 5df04e90f..946f75a23 100644 --- a/internal/config/runtime.go +++ b/internal/config/runtime.go @@ -53,6 +53,9 @@ type jitCDIModeConfig struct { type csvModeConfig struct { MountSpecPath string `toml:"mount-spec-path"` + // CompatContainerRoot specifies the compat root used when the standard + // CUDA compat libraries should not be used. 
+ CompatContainerRoot string `toml:"compat-container-root,omitempty"` } type legacyModeConfig struct { diff --git a/internal/modifier/csv.go b/internal/modifier/csv.go index c8cf4ead3..66db40387 100644 --- a/internal/modifier/csv.go +++ b/internal/modifier/csv.go @@ -59,6 +59,7 @@ func NewCSVModifier(logger logger.Interface, cfg *config.Config, container image nvcdi.WithNVIDIACDIHookPath(cfg.NVIDIACTKConfig.Path), nvcdi.WithMode(nvcdi.ModeCSV), nvcdi.WithCSVFiles(csvFiles), + nvcdi.WithCSVCompatContainerRoot(cfg.NVIDIAContainerRuntimeConfig.Modes.CSV.CompatContainerRoot), ) if err != nil { return nil, fmt.Errorf("failed to construct CDI library: %v", err) diff --git a/pkg/nvcdi/lib-csv.go b/pkg/nvcdi/lib-csv.go index 94543e0a3..ef2281077 100644 --- a/pkg/nvcdi/lib-csv.go +++ b/pkg/nvcdi/lib-csv.go @@ -33,14 +33,36 @@ import ( "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" "github.com/NVIDIA/nvidia-container-toolkit/internal/edits" "github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/tegra" + "github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/tegra/csv" ) -type csvlib nvcdilib +const ( + defaultOrinCompatContainerRoot = "/usr/local/cuda/compat-orin" +) + +type csvOptions struct { + Files []string + IgnorePatterns []string + CompatContainerRoot string +} +type csvlib nvcdilib type mixedcsvlib nvcdilib var _ deviceSpecGeneratorFactory = (*csvlib)(nil) +// asCSVLib sets any CSV-specific defaults and casts the nvcdilib instance as a +// *csvlib. +func (l *nvcdilib) asCSVLib() *csvlib { + if len(l.csv.Files) == 0 { + l.csv.Files = csv.DefaultFileList() + } + if l.csv.CompatContainerRoot == "" { + l.csv.CompatContainerRoot = defaultOrinCompatContainerRoot + } + return (*csvlib)(l) +} + // DeviceSpecGenerators creates a set of generators for the specified set of // devices. 
// If NVML is not available or the disable-multiple-csv-devices feature flag is @@ -171,7 +193,7 @@ func (l *csvDeviceGenerator) deviceNodeDiscoverer() (discover.Discover, error) { func (l *csvDeviceGenerator) deviceNodeMountSpecs() tegra.MountSpecPathsByTyper { mountSpecs := tegra.Transform( - tegra.MountSpecsFromCSVFiles(l.logger, l.csvFiles...), + tegra.MountSpecsFromCSVFiles(l.logger, l.csv.Files...), // We remove non-device nodes. tegra.OnlyDeviceNodes(), ) @@ -388,10 +410,10 @@ func isIntegratedGPU(d nvml.Device) (bool, error) { func (l *csvlib) driverDiscoverer() (discover.Discover, error) { mountSpecs := tegra.Transform( tegra.Transform( - tegra.MountSpecsFromCSVFiles(l.logger, l.csvFiles...), + tegra.MountSpecsFromCSVFiles(l.logger, l.csv.Files...), tegra.WithoutDeviceNodes(), ), - tegra.IgnoreSymlinkMountSpecsByPattern(l.csvIgnorePatterns...), + tegra.IgnoreSymlinkMountSpecsByPattern(l.csv.IgnorePatterns...), ) driverDiscoverer, err := tegra.New( tegra.WithLogger(l.logger), @@ -467,7 +489,7 @@ func (l *csvlib) cudaCompatDiscoverer() discover.Discover { // TODO: Should this be overridable through a feature flag / config option? if strings.Contains(name, "Orin (nvgpu)") { - // TODO: This should probably be a constant or configurable. - cudaCompatContainerRoot = "/usr/local/cuda/compat-orin" + // The compat root is configurable; asCSVLib applies the Orin default when unset. + cudaCompatContainerRoot = l.csv.CompatContainerRoot break } } diff --git a/pkg/nvcdi/lib-csv_test.go b/pkg/nvcdi/lib-csv_test.go index 6810b4c3f..7fef56253 100644 --- a/pkg/nvcdi/lib-csv_test.go +++ b/pkg/nvcdi/lib-csv_test.go @@ -63,39 +63,50 @@ func TestDeviceSpecGenerators(t *testing.T) { infolib: &infoInterfaceMock{ HasNvmlFunc: func() (bool, string) { return true, "forced" }, }, - // TODO: Replace this with a system-specific implementation once available. 
- nvmllib: &mock.Interface{ - InitFunc: func() nvml.Return { - return nvml.SUCCESS - }, - ShutdownFunc: func() nvml.Return { - return nvml.SUCCESS - }, - SystemGetDriverVersionFunc: func() (string, nvml.Return) { - return "540.3.0", nvml.SUCCESS - }, - DeviceGetCountFunc: func() (int, nvml.Return) { - return 1, nvml.SUCCESS + nvmllib: mockOrinServer(), + }, + expectedDeviceSpecs: []specs.Device{ + { + Name: "0", + ContainerEdits: specs.ContainerEdits{ + DeviceNodes: []*specs.DeviceNode{ + {Path: "/dev/nvidia0", HostPath: "/dev/nvidia0"}, + }, }, - DeviceGetHandleByIndexFunc: func(n int) (nvml.Device, nvml.Return) { - if n != 0 { - return nil, nvml.ERROR_INVALID_ARGUMENT - } - device := &mock.Device{ - GetUUIDFunc: func() (string, nvml.Return) { - return "GPU-orin", nvml.SUCCESS - }, - GetNameFunc: func() (string, nvml.Return) { - return "Orin (nvgpu)", nvml.SUCCESS - }, - GetPciInfoFunc: func() (nvml.PciInfo, nvml.Return) { - return nvml.PciInfo{}, nvml.ERROR_NOT_SUPPORTED - }, - } - return device, nvml.SUCCESS + }, + }, + expectedCommonEdits: &cdi.ContainerEdits{ + ContainerEdits: &specs.ContainerEdits{ + Hooks: []*specs.Hook{ + { + HookName: "createContainer", + Path: "/usr/bin/nvidia-cdi-hook", + Args: []string{"nvidia-cdi-hook", "enable-cuda-compat", "--host-driver-version=540.3.0", "--cuda-compat-container-root=/usr/local/cuda/compat-orin"}, + Env: []string{"NVIDIA_CTK_DEBUG=false"}, + }, + { + HookName: "createContainer", + Path: "/usr/bin/nvidia-cdi-hook", + Args: []string{"nvidia-cdi-hook", "update-ldcache"}, + Env: []string{"NVIDIA_CTK_DEBUG=false"}, + }, }, }, }, + }, + { + description: "single orin CSV device; custom container compat root", + rootfsFolder: "rootfs-orin", + lib: &csvlib{ + // test-case specific + infolib: &infoInterfaceMock{ + HasNvmlFunc: func() (bool, string) { return true, "forced" }, + }, + nvmllib: mockOrinServer(), + csv: csvOptions{ + CompatContainerRoot: "/another/compat/root", + }, + }, expectedDeviceSpecs: []specs.Device{ { 
Name: "0", @@ -112,7 +123,7 @@ func TestDeviceSpecGenerators(t *testing.T) { { HookName: "createContainer", Path: "/usr/bin/nvidia-cdi-hook", - Args: []string{"nvidia-cdi-hook", "enable-cuda-compat", "--host-driver-version=540.3.0", "--cuda-compat-container-root=/usr/local/cuda/compat-orin"}, + Args: []string{"nvidia-cdi-hook", "enable-cuda-compat", "--host-driver-version=540.3.0", "--cuda-compat-container-root=/another/compat/root"}, Env: []string{"NVIDIA_CTK_DEBUG=false"}, }, { @@ -188,10 +199,13 @@ func TestDeviceSpecGenerators(t *testing.T) { tc.lib.driverRoot = driverRoot tc.lib.devRoot = driverRoot - tc.lib.csvFiles = []string{ + tc.lib.csv.Files = []string{ filepath.Join(driverRoot, "/etc/nvidia-container-runtime/host-files-for-container.d/devices.csv"), filepath.Join(driverRoot, "/etc/nvidia-container-runtime/host-files-for-container.d/drivers.csv"), } + if tc.lib.csv.CompatContainerRoot == "" { + tc.lib.csv.CompatContainerRoot = defaultOrinCompatContainerRoot + } t.Run(tc.description, func(t *testing.T) { generator, err := tc.lib.DeviceSpecGenerators("all") @@ -230,6 +244,41 @@ func stripRoot[T any](root string, v T) T { return modified } +// TODO: We should move this mock to go-nvml/mock +func mockOrinServer() nvml.Interface { + return &mock.Interface{ + InitFunc: func() nvml.Return { + return nvml.SUCCESS + }, + ShutdownFunc: func() nvml.Return { + return nvml.SUCCESS + }, + SystemGetDriverVersionFunc: func() (string, nvml.Return) { + return "540.3.0", nvml.SUCCESS + }, + DeviceGetCountFunc: func() (int, nvml.Return) { + return 1, nvml.SUCCESS + }, + DeviceGetHandleByIndexFunc: func(n int) (nvml.Device, nvml.Return) { + if n != 0 { + return nil, nvml.ERROR_INVALID_ARGUMENT + } + device := &mock.Device{ + GetUUIDFunc: func() (string, nvml.Return) { + return "GPU-orin", nvml.SUCCESS + }, + GetNameFunc: func() (string, nvml.Return) { + return "Orin (nvgpu)", nvml.SUCCESS + }, + GetPciInfoFunc: func() (nvml.PciInfo, nvml.Return) { + return nvml.PciInfo{}, 
nvml.ERROR_NOT_SUPPORTED + }, + } + return device, nvml.SUCCESS + }, + } +} + // TODO: We should move this mock to go-nvml/mock func mockIGXServer() nvml.Interface { thor := &mock.Device{ diff --git a/pkg/nvcdi/lib.go b/pkg/nvcdi/lib.go index 4369a7215..fe54540e0 100644 --- a/pkg/nvcdi/lib.go +++ b/pkg/nvcdi/lib.go @@ -27,7 +27,6 @@ import ( "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root" "github.com/NVIDIA/nvidia-container-toolkit/internal/nvsandboxutils" - "github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/tegra/csv" "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/transform" ) @@ -45,8 +44,7 @@ type nvcdilib struct { configSearchPaths []string librarySearchPaths []string - csvFiles []string - csvIgnorePatterns []string + csv csvOptions vendor string class string @@ -115,10 +113,7 @@ func New(opts ...Option) (Interface, error) { var factory deviceSpecGeneratorFactory switch l.resolveMode() { case ModeCSV: - if len(l.csvFiles) == 0 { - l.csvFiles = csv.DefaultFileList() - } - factory = (*csvlib)(l) + factory = l.asCSVLib() case ModeManagement: if l.vendor == "" { l.vendor = "management.nvidia.com" diff --git a/pkg/nvcdi/options.go b/pkg/nvcdi/options.go index eab27f05d..6af7a8492 100644 --- a/pkg/nvcdi/options.go +++ b/pkg/nvcdi/options.go @@ -131,14 +131,22 @@ func WithMergedDeviceOptions(opts ...transform.MergedDeviceOption) Option { // WithCSVFiles sets the CSV files for the library func WithCSVFiles(csvFiles []string) Option { return func(o *nvcdilib) { - o.csvFiles = csvFiles + o.csv.Files = csvFiles } } // WithCSVIgnorePatterns sets the ignore patterns for entries in the CSV files. 
func WithCSVIgnorePatterns(csvIgnorePatterns []string) Option { return func(o *nvcdilib) { - o.csvIgnorePatterns = csvIgnorePatterns + o.csv.IgnorePatterns = csvIgnorePatterns + } +} + +// WithCSVCompatContainerRoot sets the container path used as the CUDA compat +// root for Orin (nvgpu) devices in CSV mode; if empty, asCSVLib sets a default. +func WithCSVCompatContainerRoot(csvCompatContainerRoot string) Option { + return func(o *nvcdilib) { + o.csv.CompatContainerRoot = csvCompatContainerRoot } }