Skip to content

Commit ee5bee0

Browse files
authored
deploy operator in chart with pod group controller (NVIDIA#467)
* deploy the KAI operator with the pod group controller operand
* disable the operator's admission and queue controller operands
1 parent 26655be commit ee5bee0

File tree

12 files changed

+188
-125
lines changed

12 files changed

+188
-125
lines changed
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Copyright 2025 NVIDIA CORPORATION
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
apiVersion: kai.scheduler/v1
5+
kind: Config
6+
metadata:
7+
name: kai-config
8+
namespace: {{ .Release.Namespace }}
9+
spec:
10+
namespace: {{ .Release.Namespace }}
11+
global:
12+
schedulerName: "kai-scheduler"
13+
queueLabelKey: "kai.scheduler/queue"
14+
nodePoolLabelKey: "kai.scheduler/nodepool"
15+
{{- if .Values.global.namespaceLabelSelector }}
16+
namespaceLabelSelector:
17+
{{- toYaml .Values.global.namespaceLabelSelector | nindent 6 }}
18+
{{- end }}
19+
{{- if .Values.global.podLabelSelector }}
20+
podLabelSelector:
21+
{{- toYaml .Values.global.podLabelSelector | nindent 6 }}
22+
{{- end }}
23+
{{- if .Values.global.affinity }}
24+
affinity:
25+
{{- toYaml .Values.global.affinity | nindent 6 }}
26+
{{- end }}
27+
{{- if .Values.global.tolerations }}
28+
tolerations:
29+
{{- toYaml .Values.global.tolerations | nindent 6 }}
30+
{{- end }}
31+
{{- if .Values.global.securityContext }}
32+
securityContext:
33+
{{- toYaml .Values.global.securityContext | nindent 6 }}
34+
{{- end }}
35+
{{- if .Values.global.imagePullSecrets }}
36+
imagesPullSecret: {{ index .Values.global.imagePullSecrets 0 | default "" }}
37+
{{- end }}
38+
replicaCount: {{ .Values.operator.replicaCount | default 1 }}
39+
40+
podGroupController:
41+
service:
42+
enabled: true
43+
image:
44+
name: {{ .Values.podgroupcontroller.image.name }}
45+
repository: {{ .Values.global.registry }}
46+
tag: {{ .Chart.Version }}
47+
pullPolicy: {{ .Values.podgroupcontroller.image.pullPolicy | default .Values.global.imagePullPolicy }}
48+
{{- if .Values.podgroupcontroller.resources }}
49+
resources:
50+
{{- toYaml .Values.podgroupcontroller.resources | nindent 6 }}
51+
{{- end }}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Copyright 2025 NVIDIA CORPORATION
2+
# SPDX-License-Identifier: Apache-2.0
3+
---
4+
apiVersion: rbac.authorization.k8s.io/v1
5+
kind: ClusterRoleBinding
6+
metadata:
7+
name: kai-operator
8+
subjects:
9+
- kind: ServiceAccount
10+
name: kai-operator
11+
namespace: {{ .Release.Namespace }}
12+
roleRef:
13+
kind: ClusterRole
14+
name: kai-operator
15+
apiGroup: rbac.authorization.k8s.io
16+
Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,7 @@
44
apiVersion: v1
55
kind: ServiceAccount
66
metadata:
7-
name: podgroup-controller
7+
name: kai-operator
8+
namespace: {{ .Release.Namespace }}
9+
labels:
10+
app: kai-operator
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# Copyright 2025 NVIDIA CORPORATION
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
apiVersion: apps/v1
5+
kind: Deployment
6+
metadata:
7+
name: kai-operator
8+
namespace: {{ .Release.Namespace }}
9+
labels:
10+
app: kai-operator
11+
spec:
12+
replicas: {{ .Values.operator.replicaCount }}
13+
selector:
14+
matchLabels:
15+
app: kai-operator
16+
template:
17+
metadata:
18+
labels:
19+
app: kai-operator
20+
spec:
21+
serviceAccountName: kai-operator
22+
containers:
23+
- name: operator
24+
image: {{ .Values.global.registry }}/{{ .Values.operator.image.name }}:{{ .Chart.Version }}
25+
imagePullPolicy: {{ .Values.operator.image.pullPolicy }}
26+
resources:
27+
{{- toYaml .Values.operator.resources | nindent 12 }}
28+
args:
29+
- --metrics-bind-address={{ .Values.operator.metricsBindAddress | default ":8080" }}
30+
- --health-probe-bind-address={{ .Values.operator.probeBindAddress | default ":8081" }}
31+
- --leader-elect={{ .Values.global.leaderElection | default false }}
32+
- --qps={{ .Values.operator.qps | default 50 }}
33+
- --burst={{ .Values.operator.burst | default 300 }}
34+
- --namespace={{ .Release.Namespace }}
35+
env:
36+
- name: WATCH_NAMESPACE
37+
value: {{ .Release.Namespace }}
38+
- name: POD_NAME
39+
valueFrom:
40+
fieldRef:
41+
fieldPath: metadata.name
42+
- name: OPERATOR_NAME
43+
value: "kai-operator"
44+
- name: MS_REPOSITORY
45+
value: {{ .Values.global.registry }}
46+
- name: MS_TAG
47+
value: {{ .Chart.Version }}
48+
ports:
49+
- containerPort: 8080
50+
name: metrics
51+
- containerPort: 8081
52+
name: probe
53+
livenessProbe:
54+
httpGet:
55+
path: /healthz
56+
port: 8081
57+
initialDelaySeconds: 15
58+
periodSeconds: 20
59+
readinessProbe:
60+
httpGet:
61+
path: /readyz
62+
port: 8081
63+
initialDelaySeconds: 5
64+
periodSeconds: 10
65+
{{- if .Values.global.imagePullSecrets }}
66+
imagePullSecrets:
67+
{{- toYaml .Values.global.imagePullSecrets | nindent 8 }}
68+
{{- end }}
69+
{{- if .Values.global.affinity }}
70+
affinity:
71+
{{- toYaml .Values.global.affinity | nindent 8 }}
72+
{{- end }}
73+
{{- if .Values.global.tolerations }}
74+
tolerations:
75+
{{- toYaml .Values.global.tolerations | nindent 8 }}
76+
{{- end }}
77+
{{- if .Values.global.nodeSelector }}
78+
nodeSelector:
79+
{{- toYaml .Values.global.nodeSelector | nindent 8 }}
80+
{{- end }}

deployments/kai-scheduler/templates/services/podgroup-controller.yaml

Lines changed: 0 additions & 51 deletions
This file was deleted.

deployments/kai-scheduler/values.yaml

Lines changed: 31 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
global:
55
registry: registry/local/kai-scheduler
6+
tag: latest
7+
imagePullPolicy: IfNotPresent
68
securityContext: {}
79
imagePullSecrets: []
810
leaderElection: false
@@ -18,33 +20,27 @@ global:
1820
serviceAccount: kai-resource-reservation
1921
appLabel: kai-resource-reservation
2022

23+
operator:
24+
image:
25+
name: operator
26+
pullPolicy: IfNotPresent
27+
replicaCount: 1
28+
metricsBindAddress: ":8080"
29+
probeBindAddress: ":8081"
30+
qps: 50
31+
burst: 300
2132

2233
podgrouper:
2334
image:
2435
name: podgrouper
2536
pullPolicy: IfNotPresent
2637
additionalArgs: []
27-
resources:
28-
limits:
29-
cpu: "500m"
30-
memory: "256Mi"
31-
requests:
32-
cpu: "250m"
33-
memory: "128Mi"
3438
queueLabelKey: "kai.scheduler/queue"
3539

3640
podgroupcontroller:
3741
image:
3842
name: podgroupcontroller
3943
pullPolicy: IfNotPresent
40-
additionalArgs: []
41-
resources:
42-
limits:
43-
cpu: "500m"
44-
memory: "256Mi"
45-
requests:
46-
cpu: "250m"
47-
memory: "128Mi"
4844

4945
binder:
5046
name: binder
@@ -56,14 +52,6 @@ binder:
5652
additionalArgs: []
5753
ports:
5854
metricsPort: 8080
59-
probePort: 8081
60-
resources:
61-
limits:
62-
cpu: "500m"
63-
memory: "256Mi"
64-
requests:
65-
cpu: "250m"
66-
memory: "128Mi"
6755
cdi: false
6856

6957
scheduler:
@@ -74,59 +62,13 @@ scheduler:
7462
placementStrategy: binpack
7563
ports:
7664
metricsPort: 8080
77-
resources:
78-
limits:
79-
cpu: "500m"
80-
memory: "256Mi"
81-
requests:
82-
cpu: "250m"
83-
memory: "128Mi"
84-
85-
webhookmanager:
86-
image:
87-
name: webhookmanager
88-
pullPolicy: IfNotPresent
89-
resources:
90-
limits:
91-
cpu: "500m"
92-
memory: "256Mi"
93-
requests:
94-
cpu: "250m"
95-
memory: "128Mi"
96-
97-
nodescaleadjuster:
98-
image:
99-
name: nodescaleadjuster
100-
pullPolicy: IfNotPresent
101-
scalingPodImage:
102-
name: scalingpod
103-
additionalArgs: []
104-
resources:
105-
limits:
106-
cpu: "500m"
107-
memory: "256Mi"
108-
requests:
109-
cpu: "250m"
110-
memory: "128Mi"
11165

11266
queuecontroller:
11367
image:
11468
name: queuecontroller
11569
pullPolicy: IfNotPresent
11670
additionalArgs: []
11771
certSecretName: queuecontroller-webhook-tls-secret
118-
resources:
119-
limits:
120-
cpu: "500m"
121-
memory: "256Mi"
122-
requests:
123-
cpu: "250m"
124-
memory: "128Mi"
125-
126-
crdupgrader:
127-
image:
128-
name: crd-upgrader
129-
pullPolicy: IfNotPresent
13072

13173
admission:
13274
name: kai-admission
@@ -139,10 +81,30 @@ admission:
13981
webhookPort: 9443
14082
metricsPort: 8080
14183
probePort: 8081
84+
cdi: false
85+
86+
nodescaleadjuster:
87+
image:
88+
name: nodescaleadjuster
89+
pullPolicy: IfNotPresent
90+
scalingPodImage:
91+
name: scalingpod
92+
additionalArgs: []
93+
94+
crdupgrader:
95+
image:
96+
name: crd-upgrader
97+
pullPolicy: IfNotPresent
98+
99+
webhookmanager:
100+
image:
101+
name: webhookmanager
102+
pullPolicy: IfNotPresent
142103
resources:
143104
limits:
144105
cpu: "500m"
145106
memory: "256Mi"
146107
requests:
147108
cpu: "250m"
148109
memory: "128Mi"
110+

pkg/apis/kai/v1/admission/admission.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ type Admission struct {
3636

3737
func (b *Admission) SetDefaultsWhereNeeded(replicaCount *int32) {
3838
b.Service = common.SetDefault(b.Service, &common.Service{})
39+
b.Service.Enabled = common.SetDefault(b.Service.Enabled, ptr.To(false))
3940
b.Service.SetDefaultsWhereNeeded(imageName)
4041

4142
b.Service.Resources = common.SetDefault(b.Service.Resources, &common.Resources{})

pkg/apis/kai/v1/admission/admission_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ var _ = Describe("Admission", func() {
2222
var replicaCount int32
2323
replicaCount = 1
2424
Admission.SetDefaultsWhereNeeded(&replicaCount)
25-
Expect(*Admission.Service.Enabled).To(Equal(true))
25+
Expect(*Admission.Service.Enabled).To(Equal(false))
2626
Expect(*Admission.Service.Image.Name).To(Equal("admission"))
2727
Expect(*Admission.Replicas).To(Equal(int32(1)))
2828
})

pkg/apis/kai/v1/pod_group_controller/pod_group_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ import (
1212
)
1313

1414
const (
15-
imageName = "pod-group-controller"
15+
imageName = "podgroupcontroller"
1616
)
1717

1818
type PodGroupController struct {

0 commit comments

Comments
 (0)