Skip to content

Commit 36872d3

Browse files
authored
adding binder operand (NVIDIA#486)
1 parent 54bde2c commit 36872d3

File tree

10 files changed

+814
-0
lines changed

10 files changed

+814
-0
lines changed

deployments/kai-scheduler/crds/kai.scheduler_configs.yaml

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,164 @@ spec:
172172
type: integer
173173
type: object
174174
type: object
175+
binder:
176+
description: Binder specifies configuration for the binder
177+
properties:
178+
maxConcurrentReconciles:
179+
description: MaxConcurrentReconciles is the maximum number of
180+
concurrent reconciles for both pods and BindRequests
181+
type: integer
182+
metricsPort:
183+
description: MetricsPort specifies the metrics service port
184+
type: integer
185+
probePort:
186+
description: ProbePort specifies the health check port
187+
type: integer
188+
replicas:
189+
description: Replicas specifies the number of replicas of the
190+
KAI binder service
191+
format: int32
192+
type: integer
193+
resourceReservation:
194+
description: ResourceReservation controls configuration for the
195+
resource reservation functionality
196+
properties:
197+
allocationTimeout:
198+
description: AllocationTimeout specifies the timeout for resource
199+
reservation pod allocation in seconds
200+
type: integer
201+
appLabel:
202+
description: AppLabel is the value that will be set for all
203+
resource reservation pods to the label `app`
204+
type: string
205+
image:
206+
description: Image is the image used by the resource reservation
207+
pods
208+
properties:
209+
name:
210+
description: Name is the name of the image
211+
type: string
212+
pullPolicy:
213+
description: PullPolicy is the pull policy of the image
214+
type: string
215+
repository:
216+
description: Repository is the repository/registry prefix
217+
for the image
218+
type: string
219+
tag:
220+
description: Tag is the tag of the image
221+
type: string
222+
type: object
223+
namespace:
224+
description: Namespace is the name of the namespace where
225+
the resource reservation pods will run
226+
type: string
227+
serviceAccountName:
228+
description: ServiceAccountName is the name of the service
229+
account that will be used by the resource reservation pods
230+
type: string
231+
type: object
232+
service:
233+
properties:
234+
enabled:
235+
description: Enabled defines whether the service should be
236+
deployed
237+
type: boolean
238+
image:
239+
description: Image is the configuration of the service image
240+
properties:
241+
name:
242+
description: Name is the name of the image
243+
type: string
244+
pullPolicy:
245+
description: PullPolicy is the pull policy of the image
246+
type: string
247+
repository:
248+
description: Repository is the repository/registry prefix
249+
for the image
250+
type: string
251+
tag:
252+
description: Tag is the tag of the image
253+
type: string
254+
type: object
255+
k8sClientConfig:
256+
description: ClientConfig specifies the configuration of k8s
257+
client
258+
properties:
259+
burst:
260+
description: Burst specifies the burst rate for the k8s
261+
client
262+
type: integer
263+
qps:
264+
description: QPS specifies the QPS rate for the k8s client
265+
type: integer
266+
type: object
267+
resources:
268+
description: Resources describes the resource requirements
269+
for the service pods
270+
properties:
271+
claims:
272+
description: |-
273+
Claims lists the names of resources, defined in spec.resourceClaims,
274+
that are used by this container.
275+
276+
This is an alpha field and requires enabling the
277+
DynamicResourceAllocation feature gate.
278+
279+
This field is immutable. It can only be set for containers.
280+
items:
281+
description: ResourceClaim references one entry in PodSpec.ResourceClaims.
282+
properties:
283+
name:
284+
description: |-
285+
Name must match the name of one entry in pod.spec.resourceClaims of
286+
the Pod where this field is used. It makes that resource available
287+
inside a container.
288+
type: string
289+
request:
290+
description: |-
291+
Request is the name chosen for a request in the referenced claim.
292+
If empty, everything from the claim is made available, otherwise
293+
only the result of this request.
294+
type: string
295+
required:
296+
- name
297+
type: object
298+
type: array
299+
x-kubernetes-list-map-keys:
300+
- name
301+
x-kubernetes-list-type: map
302+
limits:
303+
additionalProperties:
304+
anyOf:
305+
- type: integer
306+
- type: string
307+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
308+
x-kubernetes-int-or-string: true
309+
description: |-
310+
Limits describes the maximum amount of compute resources allowed.
311+
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
312+
type: object
313+
requests:
314+
additionalProperties:
315+
anyOf:
316+
- type: integer
317+
- type: string
318+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
319+
x-kubernetes-int-or-string: true
320+
description: |-
321+
Requests describes the minimum amount of compute resources required.
322+
If Requests is omitted for a container, it defaults to Limits if that is explicitly specified,
323+
otherwise to an implementation-defined value. Requests cannot exceed Limits.
324+
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
325+
type: object
326+
type: object
327+
type: object
328+
volumeBindingTimeoutSeconds:
329+
description: VolumeBindingTimeoutSeconds specifies the timeout
330+
for volume binding in seconds
331+
type: integer
332+
type: object
175333
global:
176334
description: Global defined global configuration of the system
177335
properties:

pkg/apis/kai/v1/binder/binder.go

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
// Copyright 2025 NVIDIA CORPORATION
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
// +kubebuilder:object:generate:=true
5+
package binder
6+
7+
import (
8+
"k8s.io/utils/ptr"
9+
10+
"github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1/common"
11+
"github.com/NVIDIA/KAI-scheduler/pkg/common/constants"
12+
v1 "k8s.io/api/core/v1"
13+
"k8s.io/apimachinery/pkg/api/resource"
14+
)
15+
16+
// Default image names applied when the configuration leaves the image name unset.
const (
	// imageName is handed to the binder service defaulting as its image name.
	imageName = "binder"

	// defaultResourceReservationImageName is the fallback image name for the
	// resource reservation pods.
	defaultResourceReservationImageName = "resource-reservation"
)
20+
21+
type Binder struct {
22+
Service *common.Service `json:"service,omitempty"`
23+
24+
// ResourceReservation controls configuration for the resource reservation functionality
25+
// +kubebuilder:validation:Optional
26+
ResourceReservation *ResourceReservation `json:"resourceReservation,omitempty"`
27+
28+
// Replicas specifies the number of replicas of the KAI binder service
29+
// +kubebuilder:validation:Optional
30+
Replicas *int32 `json:"replicas,omitempty"`
31+
32+
// MaxConcurrentReconciles is the maximum number of concurrent reconciles for both pods and BindRequests
33+
// +kubebuilder:validation:Optional
34+
MaxConcurrentReconciles *int `json:"maxConcurrentReconciles,omitempty"`
35+
36+
// VolumeBindingTimeoutSeconds specifies the timeout for volume binding in seconds
37+
// +kubebuilder:validation:Optional
38+
VolumeBindingTimeoutSeconds *int `json:"volumeBindingTimeoutSeconds,omitempty"`
39+
40+
// ProbePort specifies the health check port
41+
ProbePort *int `json:"probePort,omitempty"`
42+
43+
// MetricsPort specifies the metrics service port
44+
MetricsPort *int `json:"metricsPort,omitempty"`
45+
}
46+
47+
func (b *Binder) SetDefaultsWhereNeeded(replicaCount *int32) {
48+
b.Service = common.SetDefault(b.Service, &common.Service{})
49+
b.Service.Enabled = common.SetDefault(b.Service.Enabled, ptr.To(false))
50+
b.Service.SetDefaultsWhereNeeded(imageName)
51+
52+
if _, found := b.Service.Resources.Requests[v1.ResourceCPU]; !found {
53+
b.Service.Resources.Requests[v1.ResourceCPU] = resource.MustParse("50m")
54+
}
55+
if _, found := b.Service.Resources.Requests[v1.ResourceMemory]; !found {
56+
b.Service.Resources.Requests[v1.ResourceMemory] = resource.MustParse("200Mi")
57+
}
58+
if _, found := b.Service.Resources.Limits[v1.ResourceCPU]; !found {
59+
b.Service.Resources.Limits[v1.ResourceCPU] = resource.MustParse("100m")
60+
}
61+
if _, found := b.Service.Resources.Limits[v1.ResourceMemory]; !found {
62+
b.Service.Resources.Limits[v1.ResourceMemory] = resource.MustParse("200Mi")
63+
}
64+
65+
b.Replicas = common.SetDefault(b.Replicas, ptr.To(ptr.Deref(replicaCount, 1)))
66+
67+
b.ResourceReservation = common.SetDefault(b.ResourceReservation, &ResourceReservation{})
68+
b.ResourceReservation.SetDefaultsWhereNeeded()
69+
70+
b.ProbePort = common.SetDefault(b.ProbePort, ptr.To(8081))
71+
b.MetricsPort = common.SetDefault(b.MetricsPort, ptr.To(8080))
72+
73+
}
74+
75+
type ResourceReservation struct {
76+
// Image is the image used by the resource reservation pods
77+
// +kubebuilder:validation:Optional
78+
Image *common.Image `json:"image,omitempty"`
79+
80+
// AllocationTimeout specifies the timeout for resource reservation pod allocation in seconds
81+
// +kubebuilder:validation:Optional
82+
AllocationTimeout *int `json:"allocationTimeout,omitempty"`
83+
84+
// Namespace is the name of the namespace where the resource reservation pods will run
85+
// +kubebuilder:validation:Optional
86+
Namespace *string `json:"namespace,omitempty"`
87+
88+
// ServiceAccountName is the name of the service account that will be used by the resource reservation pods
89+
// +kubebuilder:validation:Optional
90+
ServiceAccountName *string `json:"serviceAccountName,omitempty"`
91+
92+
// AppLabel is the value that will be set for all resource reservation pods to the label `app`
93+
// +kubebuilder:validation:Optional
94+
AppLabel *string `json:"appLabel,omitempty"`
95+
}
96+
97+
func (r *ResourceReservation) SetDefaultsWhereNeeded() {
98+
r.Image = common.SetDefault(r.Image, &common.Image{})
99+
r.Image.Name = common.SetDefault(r.Image.Name, ptr.To(defaultResourceReservationImageName))
100+
r.Image.SetDefaultsWhereNeeded()
101+
102+
r.Namespace = common.SetDefault(r.Namespace, ptr.To(constants.DefaultResourceReservationName))
103+
r.ServiceAccountName = common.SetDefault(r.ServiceAccountName, ptr.To(constants.DefaultResourceReservationName))
104+
r.AppLabel = common.SetDefault(r.AppLabel, ptr.To(constants.DefaultResourceReservationName))
105+
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
// Copyright 2025 NVIDIA CORPORATION
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package binder
5+
6+
import (
7+
"context"
8+
"testing"
9+
10+
. "github.com/onsi/ginkgo/v2"
11+
. "github.com/onsi/gomega"
12+
)
13+
14+
func TestBinder(t *testing.T) {
15+
RegisterFailHandler(Fail)
16+
RunSpecs(t, "Binder type suite")
17+
}
18+
19+
var _ = Describe("Binder", func() {
20+
It("Set Defaults", func(ctx context.Context) {
21+
binder := &Binder{}
22+
binder.SetDefaultsWhereNeeded(nil)
23+
Expect(*binder.Service.Enabled).To(Equal(false))
24+
Expect(*binder.Service.Image.Name).To(Equal("binder"))
25+
})
26+
It("Set Defaults With Replica Count", func(ctx context.Context) {
27+
binder := &Binder{}
28+
var replicaCount int32
29+
replicaCount = 3
30+
binder.SetDefaultsWhereNeeded(&replicaCount)
31+
Expect(*binder.Replicas).To(Equal(int32(3)))
32+
})
33+
})

0 commit comments

Comments
 (0)