Skip to content

Commit 5c3065f

Browse files
authored
fix(binder): allow explicit CDI conf override (#951)
1 parent 0018270 commit 5c3065f

File tree

8 files changed

+60
-4
lines changed

8 files changed

+60
-4
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
66

77
## [Unreleased]
88

9+
## [v0.12.11] - 2026-02-03
10+
### Fixed
11+
- Added `binder.cdiEnabled` Helm value to allow explicit override of CDI auto-detection in environments without a ClusterPolicy, fixing compatibility issues on OpenShift
12+
913
## [v0.12.9] - 2026-01-21
1014

1115
### Fixed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ When `gpu-operator` <v25.10.0 is installed, the following flag should be added t
6767
```
6868
--set admission.gpuPodRuntimeClassName=null
6969
```
70+
If CDI is enabled, also add `--set binder.cdiEnabled=true`.
7071

7172
## Support & Breaking changes
7273
For details on our release lifecycle, LTS versions, and supported releases, see the [Support Policy](SUPPORT.md).

deployments/kai-scheduler/crds/kai.scheduler_configs.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1116,6 +1116,11 @@ spec:
11161116
binder:
11171117
description: Binder specifies configuration for the binder
11181118
properties:
1119+
cdiEnabled:
1120+
description: |-
1121+
CDIEnabled specifies whether the GPU device plugin uses the CDI devices API to assign GPU devices to pods.
1122+
Leave empty if unsure to let the operator auto-detect it using the ClusterPolicy (NVIDIA gpu-operator only).
1123+
type: boolean
11191124
maxConcurrentReconciles:
11201125
description: MaxConcurrentReconciles is the maximum number of
11211126
concurrent reconciles for both pods and BindRequests

deployments/kai-scheduler/templates/kai-config.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,9 @@ spec:
6767
podResources:
6868
{{- toYaml .Values.binder.resourceReservationPodResources | nindent 8 }}
6969
{{- end }}
70+
{{- if hasKey .Values.binder "cdiEnabled" }}
71+
cdiEnabled: {{ .Values.binder.cdiEnabled }}
72+
{{- end }}
7073

7174
podGroupController:
7275
service:

pkg/apis/kai/v1/binder/binder.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@ type Binder struct {
4242

4343
// MetricsPort specifies the metrics service port
4444
MetricsPort *int `json:"metricsPort,omitempty"`
45+
46+
// CDIEnabled specifies whether the GPU device plugin uses the CDI devices API to assign GPU devices to pods.
47+
// Leave empty if unsure to let the operator auto-detect it using the ClusterPolicy (NVIDIA gpu-operator only).
48+
// +kubebuilder:validation:Optional
49+
CDIEnabled *bool `json:"cdiEnabled,omitempty"`
4550
}
4651

4752
func (b *Binder) SetDefaultsWhereNeeded(replicaCount *int32) {
@@ -76,7 +81,6 @@ func (b *Binder) SetDefaultsWhereNeeded(replicaCount *int32) {
7681

7782
b.ProbePort = common.SetDefault(b.ProbePort, ptr.To(8081))
7883
b.MetricsPort = common.SetDefault(b.MetricsPort, ptr.To(8080))
79-
8084
}
8185

8286
type ResourceReservation struct {

pkg/apis/kai/v1/binder/zz_generated.deepcopy.go

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/operator/operands/binder/binder_test.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,35 @@ var _ = Describe("Binder", func() {
164164
Expect(deploymentT).NotTo(BeNil())
165165
Expect((*deploymentT).Spec.Template.Spec.Containers[0].Args).To(ContainElement("--cdi-enabled=false"))
166166
})
167+
168+
It("uses explicit CDIEnabled=true from config, ignoring ClusterPolicy", func(ctx context.Context) {
169+
clusterPolicy.Spec.CDI.Default = ptr.To(false)
170+
Expect(fakeKubeClient.Create(ctx, clusterPolicy)).To(Succeed())
171+
172+
kaiConfig.Spec.Binder.CDIEnabled = ptr.To(true)
173+
174+
objects, err := b.DesiredState(ctx, fakeKubeClient, kaiConfig)
175+
Expect(err).To(BeNil())
176+
177+
deploymentT := test_utils.FindTypeInObjects[*appsv1.Deployment](objects)
178+
Expect(deploymentT).NotTo(BeNil())
179+
Expect((*deploymentT).Spec.Template.Spec.Containers[0].Args).To(ContainElement("--cdi-enabled=true"))
180+
})
181+
182+
It("uses explicit CDIEnabled=false from config, ignoring ClusterPolicy", func(ctx context.Context) {
183+
clusterPolicy.Spec.CDI.Enabled = ptr.To(true)
184+
clusterPolicy.Spec.CDI.Default = ptr.To(true)
185+
Expect(fakeKubeClient.Create(ctx, clusterPolicy)).To(Succeed())
186+
187+
kaiConfig.Spec.Binder.CDIEnabled = ptr.To(false)
188+
189+
objects, err := b.DesiredState(ctx, fakeKubeClient, kaiConfig)
190+
Expect(err).To(BeNil())
191+
192+
deploymentT := test_utils.FindTypeInObjects[*appsv1.Deployment](objects)
193+
Expect(deploymentT).NotTo(BeNil())
194+
Expect((*deploymentT).Spec.Template.Spec.Containers[0].Args).To(ContainElement("--cdi-enabled=false"))
195+
})
167196
})
168197
})
169198

pkg/operator/operands/binder/resources.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,14 @@ func (b *Binder) deploymentForKAIConfig(
4848
return nil, err
4949
}
5050

51-
cdiEnabled, err := isCdiEnabled(ctx, runtimeClient)
52-
if err != nil {
53-
return nil, err
51+
var cdiEnabled bool
52+
if config.CDIEnabled != nil {
53+
cdiEnabled = *config.CDIEnabled
54+
} else {
55+
cdiEnabled, err = isCdiEnabled(ctx, runtimeClient)
56+
if err != nil {
57+
return nil, err
58+
}
5459
}
5560

5661
deployment.Spec.Strategy.Type = appsv1.RecreateDeploymentStrategyType

0 commit comments

Comments
 (0)