Skip to content

Commit cb7d759

Browse files
authored
Merge pull request #2066 from NVIDIA/gfd-disable-nfapi-for-ocp
[gpu-feature-discovery] disable nodefeature API for openshift clusters
2 parents 1626bd1 + 63cb7ae commit cb7d759

File tree

2 files changed

+128
-0
lines changed

2 files changed

+128
-0
lines changed

controllers/object_controls.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -940,6 +940,12 @@ func TransformGPUDiscoveryPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPol
940940
obj.Spec.Template.Spec.Containers[0].Args = config.GPUFeatureDiscovery.Args
941941
}
942942

943+
// If we are on an OpenShift cluster, we disable the NodeFeature API as a node feature label source
944+
// We can remove this once OpenShift's NFD instances start supporting the NodeFeature API
945+
if len(n.openshift) > 0 {
946+
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "USE_NODE_FEATURE_API", "false")
947+
}
948+
943949
// set/append environment variables for exporter container
944950
if len(config.GPUFeatureDiscovery.Env) > 0 {
945951
for _, env := range config.GPUFeatureDiscovery.Env {

controllers/transforms_test.go

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3518,3 +3518,125 @@ func TestTransformDriverVGPUTopologyConfig(t *testing.T) {
35183518
removeDigestFromDaemonSet(ds.DaemonSet)
35193519
require.EqualValues(t, expectedDs, ds)
35203520
}
3521+
3522+
// TestTransformGPUDiscoveryPlugin runs TransformGPUDiscoveryPlugin on a DaemonSet
// that holds a "gpu-feature-discovery" container plus "toolkit-validation" and
// "config-manager-init" init containers, using a ClusterPolicyController whose
// openshift field is left unset, and compares the outcome with expectedDs.
// The expected DaemonSet contains no USE_NODE_FEATURE_API environment variable.
func TestTransformGPUDiscoveryPlugin(t *testing.T) {
3523+
// Node handed to the fake client, labelled with nfdKernelLabelKey and commonGPULabelKey.
node := &corev1.Node{
3524+
ObjectMeta: metav1.ObjectMeta{
3525+
Name: "test-node",
3526+
Labels: map[string]string{
3527+
nfdKernelLabelKey: "6.8.0-60-generic",
3528+
commonGPULabelKey: "true",
3529+
},
3530+
},
3531+
}
3532+
mockClient := fake.NewFakeClient(node)
3533+
// Input DaemonSet: containers are identified by name only; every other field is empty.
ds := NewDaemonset().WithContainer(corev1.Container{Name: "gpu-feature-discovery"}).
3534+
WithInitContainer(corev1.Container{Name: "toolkit-validation"}).
3535+
WithInitContainer(corev1.Container{Name: "config-manager-init"})
3536+
// Cluster policy: only the GPUFeatureDiscovery image coordinates and the Validator settings are populated.
cpSpec := &gpuv1.ClusterPolicySpec{
3537+
GPUFeatureDiscovery: gpuv1.GPUFeatureDiscoverySpec{
3538+
Repository: "nvcr.io/nvidia",
3539+
Image: "k8s-device-plugin",
3540+
Version: "v0.18.1",
3541+
},
3542+
Validator: gpuv1.ValidatorSpec{
3543+
Repository: "nvcr.io/nvidia/cloud-native",
3544+
Image: "gpu-operator-validator",
3545+
Version: "v1.0.0",
3546+
ImagePullPolicy: "IfNotPresent",
3547+
ImagePullSecrets: []string{"pull-secret"},
3548+
Toolkit: gpuv1.ToolkitValidatorSpec{
3549+
Env: []gpuv1.EnvVar{{Name: "foo", Value: "bar"}},
3550+
},
3551+
},
3552+
}
3553+
// Expected DaemonSet. It lists only the toolkit-validation init container; the
// config-manager-init container from the input is absent.
// NOTE(review): presumably the transform drops config-manager-init when no
// plugin config is set in the spec — confirm against TransformGPUDiscoveryPlugin.
expectedDs := NewDaemonset().WithContainer(corev1.Container{
3554+
Name: "gpu-feature-discovery",
3555+
Image: "nvcr.io/nvidia/k8s-device-plugin:v0.18.1",
3556+
ImagePullPolicy: corev1.PullIfNotPresent,
3557+
Env: []corev1.EnvVar{
3558+
{
3559+
Name: "NVIDIA_MIG_MONITOR_DEVICES",
3560+
Value: "all",
3561+
},
3562+
},
3563+
}).WithInitContainer(corev1.Container{
3564+
Name: "toolkit-validation",
3565+
Image: "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.0.0",
3566+
ImagePullPolicy: corev1.PullIfNotPresent,
3567+
Env: []corev1.EnvVar{{Name: "foo", Value: "bar"}},
3568+
SecurityContext: &corev1.SecurityContext{
3569+
RunAsUser: rootUID,
3570+
},
3571+
}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia")
3572+
3573+
// The controller literal does not set openshift, so the field keeps its zero value.
err := TransformGPUDiscoveryPlugin(ds.DaemonSet, cpSpec,
3574+
ClusterPolicyController{client: mockClient, runtime: gpuv1.Containerd,
3575+
operatorNamespace: "test-ns", logger: ctrl.Log.WithName("test")})
3576+
require.NoError(t, err)
3577+
// NOTE(review): removeDigestFromDaemonSet is defined elsewhere; presumably it
// strips digest data from ds before the comparison — confirm.
removeDigestFromDaemonSet(ds.DaemonSet)
3578+
require.EqualValues(t, expectedDs, ds)
3579+
}
3580+
3581+
// TestTransformGPUDiscoveryPluginOCP runs TransformGPUDiscoveryPlugin with a
// ClusterPolicyController whose openshift field is set to "4.14" and compares
// the outcome with expectedDs. The expected "gpu-feature-discovery" container
// carries USE_NODE_FEATURE_API="false" ahead of NVIDIA_MIG_MONITOR_DEVICES="all".
func TestTransformGPUDiscoveryPluginOCP(t *testing.T) {
3582+
// Node handed to the fake client, labelled with nfdKernelLabelKey and commonGPULabelKey.
node := &corev1.Node{
3583+
ObjectMeta: metav1.ObjectMeta{
3584+
Name: "test-ocp-node",
3585+
Labels: map[string]string{
3586+
nfdKernelLabelKey: "5.14.0-284.43.1.el9_2.x86_64",
3587+
commonGPULabelKey: "true",
3588+
},
3589+
},
3590+
}
3591+
mockClient := fake.NewFakeClient(node)
3592+
// Input DaemonSet: containers are identified by name only; every other field is empty.
ds := NewDaemonset().WithContainer(corev1.Container{Name: "gpu-feature-discovery"}).
3593+
WithInitContainer(corev1.Container{Name: "toolkit-validation"}).
3594+
WithInitContainer(corev1.Container{Name: "config-manager-init"})
3595+
// Cluster policy: only the GPUFeatureDiscovery image coordinates and the Validator settings are populated.
cpSpec := &gpuv1.ClusterPolicySpec{
3596+
GPUFeatureDiscovery: gpuv1.GPUFeatureDiscoverySpec{
3597+
Repository: "nvcr.io/nvidia",
3598+
Image: "k8s-device-plugin",
3599+
Version: "v0.18.1",
3600+
},
3601+
Validator: gpuv1.ValidatorSpec{
3602+
Repository: "nvcr.io/nvidia/cloud-native",
3603+
Image: "gpu-operator-validator",
3604+
Version: "v1.0.0",
3605+
ImagePullPolicy: "IfNotPresent",
3606+
ImagePullSecrets: []string{"pull-secret"},
3607+
Toolkit: gpuv1.ToolkitValidatorSpec{
3608+
Env: []gpuv1.EnvVar{{Name: "foo", Value: "bar"}},
3609+
},
3610+
},
3611+
}
3612+
// Expected DaemonSet. It lists only the toolkit-validation init container; the
// config-manager-init container from the input is absent.
// NOTE(review): presumably the transform drops config-manager-init when no
// plugin config is set in the spec — confirm against TransformGPUDiscoveryPlugin.
expectedDs := NewDaemonset().WithContainer(corev1.Container{
3613+
Name: "gpu-feature-discovery",
3614+
Image: "nvcr.io/nvidia/k8s-device-plugin:v0.18.1",
3615+
ImagePullPolicy: corev1.PullIfNotPresent,
3616+
Env: []corev1.EnvVar{
3617+
// Expected first in the list: the variable that turns the NodeFeature API off.
{
3618+
Name: "USE_NODE_FEATURE_API",
3619+
Value: "false",
3620+
},
3621+
{
3622+
Name: "NVIDIA_MIG_MONITOR_DEVICES",
3623+
Value: "all",
3624+
},
3625+
},
3626+
}).WithInitContainer(corev1.Container{
3627+
Name: "toolkit-validation",
3628+
Image: "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.0.0",
3629+
ImagePullPolicy: corev1.PullIfNotPresent,
3630+
Env: []corev1.EnvVar{{Name: "foo", Value: "bar"}},
3631+
SecurityContext: &corev1.SecurityContext{
3632+
RunAsUser: rootUID,
3633+
},
3634+
}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia")
3635+
3636+
// openshift is set to a non-empty version string; this is the only difference
// from the controller literal used in TestTransformGPUDiscoveryPlugin.
err := TransformGPUDiscoveryPlugin(ds.DaemonSet, cpSpec,
3637+
ClusterPolicyController{client: mockClient, runtime: gpuv1.Containerd,
3638+
operatorNamespace: "test-ns", logger: ctrl.Log.WithName("test"), openshift: "4.14"})
3639+
require.NoError(t, err)
3640+
// NOTE(review): removeDigestFromDaemonSet is defined elsewhere; presumably it
// strips digest data from ds before the comparison — confirm.
removeDigestFromDaemonSet(ds.DaemonSet)
3641+
require.EqualValues(t, expectedDs, ds)
3642+
}

0 commit comments

Comments
 (0)