diff --git a/assets/state-device-plugin/0500_daemonset.yaml b/assets/state-device-plugin/0500_daemonset.yaml
index c4d85adfb..150c2b5fe 100644
--- a/assets/state-device-plugin/0500_daemonset.yaml
+++ b/assets/state-device-plugin/0500_daemonset.yaml
@@ -35,6 +35,32 @@ spec:
             - name: run-nvidia-validations
               mountPath: /run/nvidia/validations
               mountPropagation: HostToContainer
+        # The cdi-validation init container will only be added when CDI is enabled in the operator.
+        # This init container ensures that the device-plugin does not start until a version of
+        # the toolkit container that has CDI enabled is running. This is implemented to prevent
+        # a possible race condition where a newer version of the device-plugin, with CDI enabled,
+        # starts running while an older version of the toolkit, which does not have CDI enabled,
+        # is still running.
+        - image: "FILLED BY THE OPERATOR"
+          name: cdi-validation
+          command: [ 'sh', '-c' ]
+          args:
+            - |
+              if [ -z "${NVIDIA_CTK_LIBCUDA_DIR}" ]; then
+                echo "waiting for NVIDIA Container Toolkit to be installed with CDI enabled"
+                echo "will sleep and then exit"
+                sleep 5
+                exit 1
+              fi
+          env:
+            - name: NVIDIA_VISIBLE_DEVICES
+              value: all
+          securityContext:
+            privileged: true
+          volumeMounts:
+            - name: run-nvidia-validations
+              mountPath: /run/nvidia/validations
+              mountPropagation: HostToContainer
         - image: "FILLED BY THE OPERATOR"
           name: config-manager-init
           command: ["config-manager"]
diff --git a/controllers/object_controls.go b/controllers/object_controls.go
index dfe9c924c..f72ab6e66 100644
--- a/controllers/object_controls.go
+++ b/controllers/object_controls.go
@@ -1506,6 +1506,18 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
 	// update env required for CDI support
 	if config.CDI.IsEnabled() {
 		transformDevicePluginCtrForCDI(devicePluginMainContainer, config)
+	} else {
+		// remove the "cdi-validation" init container when CDI is not enabled
+		cdiValidationContainerName := "cdi-validation"
+		for i, container := range obj.Spec.Template.Spec.InitContainers {
+			if container.Name != cdiValidationContainerName {
+				continue
+			}
+			obj.Spec.Template.Spec.InitContainers = append(obj.Spec.Template.Spec.InitContainers[:i], obj.Spec.Template.Spec.InitContainers[i+1:]...)
+			// stop here: the slice was just modified in place, and continuing the
+			// range loop would walk stale elements using the old slice length
+			break
+		}
 	}
 
 	// update MPS volumes and set MPS_ROOT env var if a custom MPS root is configured
diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go
index eb933fc48..8477997a6 100644
--- a/controllers/transforms_test.go
+++ b/controllers/transforms_test.go
@@ -950,6 +950,36 @@ func TestTransformDevicePlugin(t *testing.T) {
 				},
 			}).WithRuntimeClassName("nvidia"),
 		},
+		{
+			description: "transform device plugin, CDI is disabled",
+			ds: NewDaemonset().
+				WithInitContainer(corev1.Container{Name: "dummy"}).
+				WithInitContainer(corev1.Container{Name: "cdi-validation"}).
+				WithContainer(corev1.Container{Name: "nvidia-device-plugin"}),
+			cpSpec: &gpuv1.ClusterPolicySpec{
+				DevicePlugin: gpuv1.DevicePluginSpec{
+					Repository: "nvcr.io/nvidia/cloud-native",
+					Image:      "nvidia-device-plugin",
+					Version:    "v1.0.0",
+				},
+				CDI: gpuv1.CDIConfigSpec{
+					Enabled: newBoolPtr(false),
+				},
+				Validator: gpuv1.ValidatorSpec{
+					Repository: "nvcr.io/nvidia",
+					Image:      "validator",
+					Version:    "v1.0.0",
+				},
+			},
+			expectedDs: NewDaemonset().WithContainer(corev1.Container{
+				Name:            "nvidia-device-plugin",
+				Image:           "nvcr.io/nvidia/cloud-native/nvidia-device-plugin:v1.0.0",
+				ImagePullPolicy: corev1.PullIfNotPresent,
+				Env: []corev1.EnvVar{
+					{Name: "NVIDIA_MIG_MONITOR_DEVICES", Value: "all"},
+				},
+			}).WithInitContainer(corev1.Container{Name: "dummy"}).WithRuntimeClassName("nvidia"),
+		},
 	}
 
 	for _, tc := range testCases {