Skip to content

Commit 33fb25b

Browse files
authored
Implement basic DRA support for NIMService (#521)
Signed-off-by: Varun Ramachandra Sekar <vsekar@nvidia.com>
1 parent 16ccde3 commit 33fb25b

File tree

16 files changed

+1148
-24
lines changed

16 files changed

+1148
-24
lines changed

api/apps/v1alpha1/common_types.go

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,3 +239,34 @@ type PersistentVolumeClaim struct {
239239
// Annotations for the PVC
240240
Annotations map[string]string `json:"annotations,omitempty"`
241241
}
242+
243+
// DRAResource references exactly one ResourceClaim, either directly
244+
// or by naming a ResourceClaimTemplate which is then turned into a ResourceClaim.
245+
//
246+
// When the NIMService pods are created, each DRAResource is given a name
247+
// (`DNS_LABEL` format) that uniquely identifies the DRA resource.
248+
type DRAResource struct {
249+
// ResourceClaimName is the name of a ResourceClaim object in the same
250+
// namespace as the NIMService.
251+
//
252+
// Exactly one of ResourceClaimName and ResourceClaimTemplateName must
253+
// be set.
254+
ResourceClaimName *string `json:"resourceClaimName,omitempty"`
255+
256+
// ResourceClaimTemplateName is the name of a ResourceClaimTemplate
257+
// object in the same namespace as the pods for this NIMService.
258+
//
259+
// The template will be used to create a new ResourceClaim, which will
260+
// be bound to the pods created for this NIMService.
261+
//
262+
// Exactly one of ResourceClaimName and ResourceClaimTemplateName must
263+
// be set.
264+
ResourceClaimTemplateName *string `json:"resourceClaimTemplateName,omitempty"`
265+
266+
// Requests is the list of request names in the referenced ResourceClaim/ResourceClaimTemplate
267+
// to be made available to the model container of the NIMService pods.
268+
//
269+
// If empty, everything from the claim is made available, otherwise
270+
// only the result of this subset of requests.
271+
Requests []string `json:"requests,omitempty"`
272+
}

api/apps/v1alpha1/nimservice_types.go

Lines changed: 52 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -64,20 +64,26 @@ type NIMServiceSpec struct {
6464
// The name of an existing pull secret containing the NGC_API_KEY
6565
AuthSecret string `json:"authSecret"`
6666
// Storage is the target storage for caching NIM model if NIMCache is not provided
67-
Storage NIMServiceStorage `json:"storage,omitempty"`
68-
Labels map[string]string `json:"labels,omitempty"`
69-
Annotations map[string]string `json:"annotations,omitempty"`
70-
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
71-
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
72-
PodAffinity *corev1.PodAffinity `json:"podAffinity,omitempty"`
73-
Resources *corev1.ResourceRequirements `json:"resources,omitempty"`
74-
Expose Expose `json:"expose,omitempty"`
75-
LivenessProbe Probe `json:"livenessProbe,omitempty"`
76-
ReadinessProbe Probe `json:"readinessProbe,omitempty"`
77-
StartupProbe Probe `json:"startupProbe,omitempty"`
78-
Scale Autoscaling `json:"scale,omitempty"`
79-
SchedulerName string `json:"schedulerName,omitempty"`
80-
Metrics Metrics `json:"metrics,omitempty"`
67+
Storage NIMServiceStorage `json:"storage,omitempty"`
68+
Labels map[string]string `json:"labels,omitempty"`
69+
Annotations map[string]string `json:"annotations,omitempty"`
70+
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
71+
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
72+
PodAffinity *corev1.PodAffinity `json:"podAffinity,omitempty"`
73+
// Resources is the resource requirements for the NIMService deployment.
74+
//
75+
// Note: Only traditional resources like cpu/memory and custom device plugin resources are supported here.
76+
// Any DRA claim references are ignored. Use DRAResources instead for those.
77+
Resources *corev1.ResourceRequirements `json:"resources,omitempty"`
78+
// DRAResources is the list of DRA resource claims to be used for the NIMService deployment.
79+
DRAResources []DRAResource `json:"draResources,omitempty"`
80+
Expose Expose `json:"expose,omitempty"`
81+
LivenessProbe Probe `json:"livenessProbe,omitempty"`
82+
ReadinessProbe Probe `json:"readinessProbe,omitempty"`
83+
StartupProbe Probe `json:"startupProbe,omitempty"`
84+
Scale Autoscaling `json:"scale,omitempty"`
85+
SchedulerName string `json:"schedulerName,omitempty"`
86+
Metrics Metrics `json:"metrics,omitempty"`
8187
// +kubebuilder:validation:Minimum=1
8288
// +kubebuilder:default:=1
8389
Replicas int `json:"replicas,omitempty"`
@@ -99,6 +105,10 @@ type NIMServiceStatus struct {
99105
AvailableReplicas int32 `json:"availableReplicas,omitempty"`
100106
State string `json:"state,omitempty"`
101107
Model *ModelStatus `json:"model,omitempty"`
108+
// DRAResourceStatuses is the status of the DRA resources.
109+
// +listType=map
110+
// +listMapKey=name
111+
DRAResourceStatuses []DRAResourceStatus `json:"draResourceStatuses,omitempty"`
102112
}
103113

104114
// ModelStatus defines the configuration of the NIMService model.
@@ -108,6 +118,26 @@ type ModelStatus struct {
108118
ExternalEndpoint string `json:"externalEndpoint"`
109119
}
110120

121+
// DRAResourceStatus defines the status of a single DRAResource of the NIMService.
122+
type DRAResourceStatus struct {
123+
// Name is the generated name of the DRAResource referenced in the NIMService
124+
// pod template as `spec.resourceClaims[].name`.
125+
Name string `json:"name"`
126+
// ResourceClaimTemplateName is the name of the ResourceClaimTemplate that was
127+
// used to generate the ResourceClaim for an instance of NIMService.
128+
ResourceClaimTemplateName *string `json:"resourceClaimTemplateName,omitempty"`
129+
// ResourceClaims lists the statuses of the ResourceClaims generated from the template.
130+
//
131+
// This list is empty if ResourceClaimTemplateName is not set.
132+
ResourceClaims []DRAResourceClaimStatus `json:"resourceClaims,omitempty"`
133+
}
134+
135+
// DRAResourceClaimStatus defines the status of a ResourceClaim generated for a DRAResource.
136+
type DRAResourceClaimStatus struct {
137+
// Name is the name of the ResourceClaim that was generated for a NIMService pod.
138+
Name string `json:"name"`
139+
}
140+
111141
// +genclient
112142
// +kubebuilder:object:root=true
113143
// +kubebuilder:subresource:status
@@ -364,7 +394,14 @@ func (n *NIMService) GetImagePullPolicy() string {
364394

365395
// GetResources returns resources to allocate to the NIMService container.
//
// It returns nil when spec.resources is unset. Otherwise it returns a new
// ResourceRequirements value carrying only the Requests and Limits of
// spec.resources; any other field of spec.resources (such as DRA claim
// references) is not carried over. The Requests and Limits values themselves
// are assigned, not deep-copied.
366396
func (n *NIMService) GetResources() *corev1.ResourceRequirements {
367-
return n.Spec.Resources
397+
if n.Spec.Resources == nil {
398+
return nil
399+
}
400+
401+
return &corev1.ResourceRequirements{
402+
Requests: n.Spec.Resources.Requests,
403+
Limits: n.Spec.Resources.Limits,
404+
}
368405
}
369406

370407
// IsProbeEnabled returns true if a given liveness/readiness/startup probe is enabled.

api/apps/v1alpha1/zz_generated.deepcopy.go

Lines changed: 84 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/apps.nvidia.com_nimpipelines.yaml

Lines changed: 47 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,48 @@ spec:
101101
items:
102102
type: string
103103
type: array
104+
draResources:
105+
description: DRAResources is the list of DRA resource claims
106+
to be used for the NIMService deployment.
107+
items:
108+
description: |-
109+
DRAResource references exactly one ResourceClaim, either directly
110+
or by naming a ResourceClaimTemplate which is then turned into a ResourceClaim.
111+
112+
When creating the NIMService pods, it adds a name (`DNS_LABEL` format) to it
113+
that uniquely identifies the DRA resource.
114+
properties:
115+
requests:
116+
description: |-
117+
Requests is the list of requests in the referenced ResourceClaim/ResourceClaimTemplate
118+
to be made available to the model container of the NIMService pods.
119+
120+
If empty, everything from the claim is made available, otherwise
121+
only the result of this subset of requests.
122+
items:
123+
type: string
124+
type: array
125+
resourceClaimName:
126+
description: |-
127+
ResourceClaimName is the name of a ResourceClaim object in the same
128+
namespace as the NIMService.
129+
130+
Exactly one of ResourceClaimName and ResourceClaimTemplateName must
131+
be set.
132+
type: string
133+
resourceClaimTemplateName:
134+
description: |-
135+
ResourceClaimTemplateName is the name of a ResourceClaimTemplate
136+
object in the same namespace as the pods for this NIMService.
137+
138+
The template will be used to create a new ResourceClaim, which will
139+
be bound to the pods created for this NIMService.
140+
141+
Exactly one of ResourceClaimName and ResourceClaimTemplateName must
142+
be set.
143+
type: string
144+
type: object
145+
type: array
104146
env:
105147
items:
106148
description: EnvVar represents an environment variable
@@ -1322,8 +1364,11 @@ spec:
13221364
minimum: 1
13231365
type: integer
13241366
resources:
1325-
description: ResourceRequirements describes the compute
1326-
resource requirements.
1367+
description: |-
1368+
Resources is the resource requirements for the NIMService deployment.
1369+
1370+
Note: Only traditional resources like cpu/memory and custom device plugin resources are supported here.
1371+
Any DRA claim references are ignored. Use DRAResources instead for those.
13271372
properties:
13281373
claims:
13291374
description: |-

0 commit comments

Comments
 (0)