Skip to content

Commit 95acb18

Browse files
authored
Merge pull request #1796 from cdesiniotis/use-hook-on-crio
Revert back to using the hook on cri-o when CDI is disabled
2 parents 2068177 + cb61abc commit 95acb18

File tree

2 files changed

+104
-16
lines changed

2 files changed

+104
-16
lines changed

controllers/object_controls.go

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -958,7 +958,7 @@ func TransformGPUDiscoveryPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPol
958958
return err
959959
}
960960

961-
setRuntimeClassName(&obj.Spec.Template.Spec, config)
961+
setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)
962962

963963
// update env required for MIG support
964964
applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy)
@@ -1241,6 +1241,7 @@ func transformToolkitCtrForCDI(container *corev1.Container) {
12411241
setContainerEnv(container, CDIEnabledEnvName, "true")
12421242
setContainerEnv(container, NvidiaRuntimeSetAsDefaultEnvName, "false")
12431243
setContainerEnv(container, NvidiaCtrRuntimeModeEnvName, "cdi")
1244+
setContainerEnv(container, CRIOConfigModeEnvName, "config")
12441245
}
12451246

12461247
// TransformToolkit transforms Nvidia container-toolkit daemonset with required config as per ClusterPolicy
@@ -1283,6 +1284,18 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n
12831284
// update env required for CDI support
12841285
if config.CDI.IsEnabled() {
12851286
transformToolkitCtrForCDI(toolkitMainContainer)
1287+
} else if n.runtime == gpuv1.CRIO {
1288+
// (cdesiniotis) When CDI is not enabled and cri-o is the container runtime,
1289+
// we continue to install the OCI prestart hook as opposed to adding nvidia
1290+
// runtime handlers to the cri-o configuration. Users can override this behavior
1291+
// and have nvidia runtime handlers added to the cri-o configuration by setting
1292+
// the 'CRIO_CONFIG_MODE' environment variable to 'config' in the toolkit container.
1293+
// However, one should note setting 'CRIO_CONFIG_MODE' to 'config' in this case
1294+
// (when CDI is not enabled) would result in the 'nvidia' runtime being set as
1295+
// the default runtime. While this should work in theory, it is a significant
1296+
// change -- which was the primary motivation to continue using the OCI prestart
1297+
// hook by default in this case.
1298+
setContainerEnv(toolkitMainContainer, CRIOConfigModeEnvName, "hook")
12861299
}
12871300

12881301
// set install directory for the toolkit
@@ -1337,11 +1350,6 @@ func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec,
13371350
setContainerEnv(container, "CONTAINERD_RUNTIME_CLASS", getRuntimeClassName(config))
13381351
}
13391352

1340-
if runtime == gpuv1.CRIO.String() {
1341-
// We add the nvidia runtime to the cri-o config by default instead of installing the OCI prestart hook
1342-
setContainerEnv(container, CRIOConfigModeEnvName, "config")
1343-
}
1344-
13451353
// For runtime config files we have top-level configs and drop-in files.
13461354
// These are supported as follows:
13471355
// * Docker only supports top-level config files.
@@ -1517,7 +1525,7 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
15171525
return err
15181526
}
15191527

1520-
setRuntimeClassName(&obj.Spec.Template.Spec, config)
1528+
setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)
15211529

15221530
// update env required for MIG support
15231531
applyMIGConfiguration(devicePluginMainContainer, config.MIG.Strategy)
@@ -1597,7 +1605,7 @@ func TransformMPSControlDaemon(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolic
15971605
return err
15981606
}
15991607

1600-
setRuntimeClassName(&obj.Spec.Template.Spec, config)
1608+
setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)
16011609

16021610
// update env required for MIG support
16031611
applyMIGConfiguration(mpsControlMainContainer, config.MIG.Strategy)
@@ -1705,7 +1713,7 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
17051713
}
17061714
}
17071715

1708-
setRuntimeClassName(&obj.Spec.Template.Spec, config)
1716+
setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)
17091717

17101718
// set hostPID if specified for DCGM Exporter
17111719
if config.DCGMExporter.IsHostPIDEnabled() {
@@ -1830,7 +1838,7 @@ func TransformDCGM(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n Clu
18301838
}
18311839
}
18321840

1833-
setRuntimeClassName(&obj.Spec.Template.Spec, config)
1841+
setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)
18341842

18351843
return nil
18361844
}
@@ -1872,7 +1880,7 @@ func TransformMIGManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec,
18721880
obj.Spec.Template.Spec.Containers[0].Args = config.MIGManager.Args
18731881
}
18741882

1875-
setRuntimeClassName(&obj.Spec.Template.Spec, config)
1883+
setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)
18761884

18771885
// set ConfigMap name for "mig-parted-config" Volume
18781886
for i, vol := range obj.Spec.Template.Spec.Volumes {
@@ -2166,7 +2174,7 @@ func TransformValidator(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec,
21662174
return fmt.Errorf("%v", err)
21672175
}
21682176

2169-
setRuntimeClassName(&obj.Spec.Template.Spec, config)
2177+
setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)
21702178

21712179
var validatorErr error
21722180
// apply changes for individual component validators(initContainers)
@@ -2540,7 +2548,10 @@ func getRuntimeClassName(config *gpuv1.ClusterPolicySpec) string {
25402548
return DefaultRuntimeClass
25412549
}
25422550

2543-
func setRuntimeClassName(podSpec *corev1.PodSpec, config *gpuv1.ClusterPolicySpec) {
2551+
func setRuntimeClassName(podSpec *corev1.PodSpec, config *gpuv1.ClusterPolicySpec, runtime gpuv1.Runtime) {
2552+
if !config.CDI.IsEnabled() && runtime == gpuv1.CRIO {
2553+
return
2554+
}
25442555
runtimeClassName := getRuntimeClassName(config)
25452556
podSpec.RuntimeClassName = &runtimeClassName
25462557
}

controllers/transforms_test.go

Lines changed: 80 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -478,7 +478,6 @@ func TestTransformForRuntime(t *testing.T) {
478478
Name: "test-ctr",
479479
Env: []corev1.EnvVar{
480480
{Name: "RUNTIME", Value: gpuv1.CRIO.String()},
481-
{Name: CRIOConfigModeEnvName, Value: "config"},
482481
{Name: "RUNTIME_CONFIG", Value: "/runtime/config-dir/config.toml"},
483482
{Name: "CRIO_CONFIG", Value: "/runtime/config-dir/config.toml"},
484483
{Name: "RUNTIME_DROP_IN_CONFIG", Value: "/runtime/config-dir.d/99-nvidia.conf"},
@@ -775,12 +774,14 @@ func TestTransformToolkit(t *testing.T) {
775774
description string
776775
ds Daemonset // Input DaemonSet
777776
cpSpec *gpuv1.ClusterPolicySpec // Input configuration
778-
expectedDs Daemonset // Expected output DaemonSet
777+
runtime gpuv1.Runtime
778+
expectedDs Daemonset // Expected output DaemonSet
779779
}{
780780
{
781781
description: "transform nvidia-container-toolkit-ctr container",
782782
ds: NewDaemonset().
783783
WithContainer(corev1.Container{Name: "nvidia-container-toolkit-ctr"}),
784+
runtime: gpuv1.Containerd,
784785
cpSpec: &gpuv1.ClusterPolicySpec{
785786
Toolkit: gpuv1.ToolkitSpec{
786787
Repository: "nvcr.io/nvidia/cloud-native",
@@ -822,6 +823,7 @@ func TestTransformToolkit(t *testing.T) {
822823
{Name: CDIEnabledEnvName, Value: "true"},
823824
{Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"},
824825
{Name: NvidiaCtrRuntimeModeEnvName, Value: "cdi"},
826+
{Name: CRIOConfigModeEnvName, Value: "config"},
825827
{Name: "foo", Value: "bar"},
826828
{Name: "RUNTIME", Value: "containerd"},
827829
{Name: "CONTAINERD_RUNTIME_CLASS", Value: "nvidia"},
@@ -847,6 +849,7 @@ func TestTransformToolkit(t *testing.T) {
847849
description: "transform nvidia-container-toolkit-ctr container with custom ctr runtime socket",
848850
ds: NewDaemonset().
849851
WithContainer(corev1.Container{Name: "nvidia-container-toolkit-ctr"}),
852+
runtime: gpuv1.Containerd,
850853
cpSpec: &gpuv1.ClusterPolicySpec{
851854
Toolkit: gpuv1.ToolkitSpec{
852855
Repository: "nvcr.io/nvidia/cloud-native",
@@ -899,6 +902,7 @@ func TestTransformToolkit(t *testing.T) {
899902
{Name: CDIEnabledEnvName, Value: "true"},
900903
{Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"},
901904
{Name: NvidiaCtrRuntimeModeEnvName, Value: "cdi"},
905+
{Name: CRIOConfigModeEnvName, Value: "config"},
902906
{Name: "CONTAINERD_CONFIG", Value: "/runtime/config-dir/config.toml"},
903907
{Name: "CONTAINERD_SOCKET", Value: "/runtime/sock-dir/containerd.sock"},
904908
{Name: "CONTAINERD_RUNTIME_CLASS", Value: "nvidia"},
@@ -920,12 +924,84 @@ func TestTransformToolkit(t *testing.T) {
920924
WithHostPathVolume("containerd-socket", "/run/k3s/containerd", nil).
921925
WithPullSecret("pull-secret"),
922926
},
927+
{
928+
description: "transform nvidia-container-toolkit-ctr container, cri-o runtime, cdi enabled",
929+
ds: NewDaemonset().
930+
WithContainer(corev1.Container{Name: "nvidia-container-toolkit-ctr"}),
931+
runtime: gpuv1.CRIO,
932+
cpSpec: &gpuv1.ClusterPolicySpec{
933+
Toolkit: gpuv1.ToolkitSpec{
934+
Repository: "nvcr.io/nvidia/cloud-native",
935+
Image: "nvidia-container-toolkit",
936+
Version: "v1.0.0",
937+
},
938+
},
939+
expectedDs: NewDaemonset().
940+
WithContainer(corev1.Container{
941+
Name: "nvidia-container-toolkit-ctr",
942+
Image: "nvcr.io/nvidia/cloud-native/nvidia-container-toolkit:v1.0.0",
943+
ImagePullPolicy: corev1.PullIfNotPresent,
944+
Env: []corev1.EnvVar{
945+
{Name: CDIEnabledEnvName, Value: "true"},
946+
{Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"},
947+
{Name: NvidiaCtrRuntimeModeEnvName, Value: "cdi"},
948+
{Name: CRIOConfigModeEnvName, Value: "config"},
949+
{Name: "RUNTIME", Value: gpuv1.CRIO.String()},
950+
{Name: "RUNTIME_CONFIG", Value: "/runtime/config-dir/config.toml"},
951+
{Name: "CRIO_CONFIG", Value: "/runtime/config-dir/config.toml"},
952+
{Name: "RUNTIME_DROP_IN_CONFIG", Value: "/runtime/config-dir.d/99-nvidia.conf"},
953+
{Name: "RUNTIME_DROP_IN_CONFIG_HOST_PATH", Value: "/etc/crio/crio.conf.d/99-nvidia.conf"},
954+
},
955+
VolumeMounts: []corev1.VolumeMount{
956+
{Name: "crio-config", MountPath: DefaultRuntimeConfigTargetDir},
957+
{Name: "crio-drop-in-config", MountPath: "/runtime/config-dir.d/"},
958+
},
959+
}).
960+
WithHostPathVolume("crio-config", "/etc/crio", newHostPathType(corev1.HostPathDirectoryOrCreate)).
961+
WithHostPathVolume("crio-drop-in-config", "/etc/crio/crio.conf.d", newHostPathType(corev1.HostPathDirectoryOrCreate)),
962+
},
963+
{
964+
description: "transform nvidia-container-toolkit-ctr container, cri-o runtime, cdi disabled",
965+
ds: NewDaemonset().
966+
WithContainer(corev1.Container{Name: "nvidia-container-toolkit-ctr"}),
967+
runtime: gpuv1.CRIO,
968+
cpSpec: &gpuv1.ClusterPolicySpec{
969+
Toolkit: gpuv1.ToolkitSpec{
970+
Repository: "nvcr.io/nvidia/cloud-native",
971+
Image: "nvidia-container-toolkit",
972+
Version: "v1.0.0",
973+
},
974+
CDI: gpuv1.CDIConfigSpec{
975+
Enabled: newBoolPtr(false),
976+
},
977+
},
978+
expectedDs: NewDaemonset().
979+
WithContainer(corev1.Container{
980+
Name: "nvidia-container-toolkit-ctr",
981+
Image: "nvcr.io/nvidia/cloud-native/nvidia-container-toolkit:v1.0.0",
982+
ImagePullPolicy: corev1.PullIfNotPresent,
983+
Env: []corev1.EnvVar{
984+
{Name: CRIOConfigModeEnvName, Value: "hook"},
985+
{Name: "RUNTIME", Value: gpuv1.CRIO.String()},
986+
{Name: "RUNTIME_CONFIG", Value: "/runtime/config-dir/config.toml"},
987+
{Name: "CRIO_CONFIG", Value: "/runtime/config-dir/config.toml"},
988+
{Name: "RUNTIME_DROP_IN_CONFIG", Value: "/runtime/config-dir.d/99-nvidia.conf"},
989+
{Name: "RUNTIME_DROP_IN_CONFIG_HOST_PATH", Value: "/etc/crio/crio.conf.d/99-nvidia.conf"},
990+
},
991+
VolumeMounts: []corev1.VolumeMount{
992+
{Name: "crio-config", MountPath: DefaultRuntimeConfigTargetDir},
993+
{Name: "crio-drop-in-config", MountPath: "/runtime/config-dir.d/"},
994+
},
995+
}).
996+
WithHostPathVolume("crio-config", "/etc/crio", newHostPathType(corev1.HostPathDirectoryOrCreate)).
997+
WithHostPathVolume("crio-drop-in-config", "/etc/crio/crio.conf.d", newHostPathType(corev1.HostPathDirectoryOrCreate)),
998+
},
923999
}
9241000

9251001
for _, tc := range testCases {
9261002
t.Run(tc.description, func(t *testing.T) {
9271003
controller := ClusterPolicyController{
928-
runtime: gpuv1.Containerd,
1004+
runtime: tc.runtime,
9291005
logger: ctrl.Log.WithName("test"),
9301006
}
9311007

@@ -2587,6 +2663,7 @@ func TestTransformToolkitCtrForCDI(t *testing.T) {
25872663
{Name: CDIEnabledEnvName, Value: "true"},
25882664
{Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"},
25892665
{Name: NvidiaCtrRuntimeModeEnvName, Value: "cdi"},
2666+
{Name: CRIOConfigModeEnvName, Value: "config"},
25902667
},
25912668
}),
25922669
},

0 commit comments

Comments
 (0)