Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions assets/state-device-plugin/0500_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,32 @@ spec:
- name: run-nvidia-validations
mountPath: /run/nvidia/validations
mountPropagation: HostToContainer
# The cdi-validation init container is only kept when CDI is enabled in the operator; the operator removes it from the DaemonSet otherwise.
# This init container ensures that the device-plugin does not start until a version of
# the toolkit container that has CDI enabled is running. This is implemented to prevent
# a possible race condition where a newer version of the device-plugin, with CDI enabled,
# starts running while an older version of the toolkit, which does not have CDI enabled,
# is still running.
- image: "FILLED BY THE OPERATOR"
name: cdi-validation
command: [ 'sh', '-c' ]
args:
- |
if [ -z "${NVIDIA_CTK_LIBCUDA_DIR}" ]; then
echo "waiting for NVIDIA Container Toolkit to be installed with CDI enabled"
echo "will sleep and then exit"
sleep 5
exit 1
fi
env:
- name: NVIDIA_VISIBLE_DEVICES
value: all
securityContext:
privileged: true
volumeMounts:
- name: run-nvidia-validations
mountPath: /run/nvidia/validations
mountPropagation: HostToContainer
- image: "FILLED BY THE OPERATOR"
name: config-manager-init
command: ["config-manager"]
Expand Down
9 changes: 9 additions & 0 deletions controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -1506,6 +1506,15 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
// update env required for CDI support
if config.CDI.IsEnabled() {
transformDevicePluginCtrForCDI(devicePluginMainContainer, config)
} else {
// remove the "cdi-validation" init container when CDI is not enabled
cdiValidationContainerName := "cdi-validation"
for i, container := range obj.Spec.Template.Spec.InitContainers {
if container.Name != cdiValidationContainerName {
continue
}
obj.Spec.Template.Spec.InitContainers = append(obj.Spec.Template.Spec.InitContainers[:i], obj.Spec.Template.Spec.InitContainers[i+1:]...)
}
}

// update MPS volumes and set MPS_ROOT env var if a custom MPS root is configured
Expand Down
30 changes: 30 additions & 0 deletions controllers/transforms_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -950,6 +950,36 @@ func TestTransformDevicePlugin(t *testing.T) {
},
}).WithRuntimeClassName("nvidia"),
},
{
description: "transform device plugin, CDI is disabled",
ds: NewDaemonset().
WithInitContainer(corev1.Container{Name: "dummy"}).
WithInitContainer(corev1.Container{Name: "cdi-validation"}).
WithContainer(corev1.Container{Name: "nvidia-device-plugin"}),
cpSpec: &gpuv1.ClusterPolicySpec{
DevicePlugin: gpuv1.DevicePluginSpec{
Repository: "nvcr.io/nvidia/cloud-native",
Image: "nvidia-device-plugin",
Version: "v1.0.0",
},
CDI: gpuv1.CDIConfigSpec{
Enabled: newBoolPtr(false),
},
Validator: gpuv1.ValidatorSpec{
Repository: "nvcr.io/nvidia",
Image: "validator",
Version: "v1.0.0",
},
},
expectedDs: NewDaemonset().WithContainer(corev1.Container{
Name: "nvidia-device-plugin",
Image: "nvcr.io/nvidia/cloud-native/nvidia-device-plugin:v1.0.0",
ImagePullPolicy: corev1.PullIfNotPresent,
Env: []corev1.EnvVar{
{Name: "NVIDIA_MIG_MONITOR_DEVICES", Value: "all"},
},
}).WithInitContainer(corev1.Container{Name: "dummy"}).WithRuntimeClassName("nvidia"),
},
}

for _, tc := range testCases {
Expand Down