Skip to content

Commit 0860b58

Browse files
committed
rework DRA apis
Signed-off-by: Varun Ramachandra Sekar <vsekar@nvidia.com>
1 parent 38e3dde commit 0860b58

File tree

12 files changed

+517
-315
lines changed

12 files changed

+517
-315
lines changed

api/apps/v1alpha1/common_types.go

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,3 +239,43 @@ type PersistentVolumeClaim struct {
239239
// Annotations for the PVC
240240
Annotations map[string]string `json:"annotations,omitempty"`
241241
}
242+
243+
// DRAResource references exactly one ResourceClaim, either directly
244+
// or by naming a ResourceClaimTemplate which is then turned into a ResourceClaim.
245+
//
246+
// It adds a name to it that uniquely identifies the ResourceClaim.
247+
// NIMService containers that need access to the ResourceClaim will automatically reference it with this name.
248+
type DRAResource struct {
249+
// Name uniquely identifies this resource claim.
250+
// This must be a DNS_LABEL.
251+
Name string `json:"name"`
252+
253+
// ResourceClaimName is the name of a ResourceClaim object in the same
254+
// namespace as the NIMService.
255+
//
256+
// Exactly one of ResourceClaimName and ResourceClaimTemplateName must
257+
// be set.
258+
ResourceClaimName *string `json:"resourceClaimName,omitempty"`
259+
260+
// ResourceClaimTemplateName is the name of a ResourceClaimTemplate
261+
// object in the same namespace as the pods for this NIMService.
262+
//
263+
// The template will be used to create a new ResourceClaim, which will
264+
// be bound to the pods created for this NIMService. When the pod is deleted,
265+
// the ResourceClaim will also be deleted. The pod name and resource name, along
266+
// with a generated component, will be used to form a unique name for the
267+
// ResourceClaim, which will be recorded in pod.status.resourceClaimStatuses.
268+
//
269+
// Modifying this field will result in the NIMService going to Failed state.
270+
//
271+
// Exactly one of ResourceClaimName and ResourceClaimTemplateName must
272+
// be set.
273+
ResourceClaimTemplateName *string `json:"resourceClaimTemplateName,omitempty"`
274+
275+
// Requests is the list of requests in the referenced ResourceClaim/ResourceClaimTemplate
276+
// to be made available to the model container of the NIMService pods.
277+
//
278+
// If empty, everything from the claim is made available, otherwise
279+
// only the result of this subset of requests.
280+
Requests []string `json:"requests,omitempty"`
281+

// NOTE(review): no validation marker on this struct enforces the documented
// "exactly one of ResourceClaimName and ResourceClaimTemplateName" rule or the
// DNS_LABEL format of Name; only Name is required by its JSON tag. Confirm
// these are enforced elsewhere (e.g. webhook or controller) or add CRD
// validation.
}

api/apps/v1alpha1/nimservice_types.go

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -64,13 +64,18 @@ type NIMServiceSpec struct {
6464
// The name of an existing pull secret containing the NGC_API_KEY
6565
AuthSecret string `json:"authSecret"`
6666
// Storage is the target storage for caching NIM model if NIMCache is not provided
67-
Storage NIMServiceStorage `json:"storage,omitempty"`
68-
Labels map[string]string `json:"labels,omitempty"`
69-
Annotations map[string]string `json:"annotations,omitempty"`
70-
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
71-
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
72-
PodAffinity *corev1.PodAffinity `json:"podAffinity,omitempty"`
67+
Storage NIMServiceStorage `json:"storage,omitempty"`
68+
Labels map[string]string `json:"labels,omitempty"`
69+
Annotations map[string]string `json:"annotations,omitempty"`
70+
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
71+
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
72+
PodAffinity *corev1.PodAffinity `json:"podAffinity,omitempty"`
73+
// Resources is the resource requirements for the NIMService deployment.
74+
//
75+
// Note: Only traditional resources like cpu/memory and custom device plugin resources are supported here.
76+
// Any DRA claim references are ignored. Use DRAResources instead for those.
7377
Resources *corev1.ResourceRequirements `json:"resources,omitempty"`
78+
DRAResources []DRAResource `json:"draResources,omitempty"`
7479
Expose Expose `json:"expose,omitempty"`
7580
LivenessProbe Probe `json:"livenessProbe,omitempty"`
7681
ReadinessProbe Probe `json:"readinessProbe,omitempty"`
@@ -85,8 +90,6 @@ type NIMServiceSpec struct {
8590
GroupID *int64 `json:"groupID,omitempty"`
8691
RuntimeClassName string `json:"runtimeClassName,omitempty"`
8792
Proxy *ProxySpec `json:"proxy,omitempty"`
88-
89-
ResourceClaims []corev1.PodResourceClaim `json:"resourceClaims,omitempty"`
9093
}
9194

9295
// NIMCacheVolSpec defines the spec to use NIMCache volume.
@@ -366,7 +369,14 @@ func (n *NIMService) GetImagePullPolicy() string {
366369

367370
// GetResources returns resources to allocate to the NIMService container.
//
// It returns nil when Spec.Resources is nil. Otherwise it returns a new
// ResourceRequirements that carries over only Requests and Limits from
// Spec.Resources; no other field (such as claims) is copied. The Requests
// and Limits values are assigned as-is, not deep-copied.
368371
func (n *NIMService) GetResources() *corev1.ResourceRequirements {
369-
return n.Spec.Resources
372+
if n.Spec.Resources == nil {
373+
return nil
374+
}
375+
376+
return &corev1.ResourceRequirements{
377+
Requests: n.Spec.Resources.Requests,
378+
Limits: n.Spec.Resources.Limits,
379+
}
370380
}
371381

372382
// IsProbeEnabled returns true if a given liveness/readiness/startup probe is enabled.
@@ -722,7 +732,7 @@ func (n *NIMService) GetDeploymentParams() *rendertypes.DeploymentParams {
722732
})
723733
}
724734

725-
params.PodResourceClaims = n.GetResourceClaims()
735+
params.PodResourceClaims = n.GetPodResourceClaims()
726736
return params
727737
}
728738

@@ -998,8 +1008,16 @@ func (n *NIMService) GetProxySpec() *ProxySpec {
9981008
return n.Spec.Proxy
9991009
}
10001010

1001-
func (n *NIMService) GetResourceClaims() []corev1.PodResourceClaim {
1002-
return n.Spec.ResourceClaims
1011+
// GetPodResourceClaims builds one corev1.PodResourceClaim per entry in
// Spec.DRAResources, in the same order, copying Name, ResourceClaimName and
// ResourceClaimTemplateName from each entry.
//
// The Requests field of a DRAResource is not copied here. When
// Spec.DRAResources is empty the result is an empty, non-nil slice.
func (n *NIMService) GetPodResourceClaims() []corev1.PodResourceClaim {
1012+
claims := make([]corev1.PodResourceClaim, len(n.Spec.DRAResources))
1013+
for idx, resource := range n.Spec.DRAResources {
1014+
claims[idx] = corev1.PodResourceClaim{
1015+
Name: resource.Name,
1016+
ResourceClaimName: resource.ResourceClaimName,
1017+
ResourceClaimTemplateName: resource.ResourceClaimTemplateName,
1018+
}
1019+
}
1020+
return claims
10031021
}
10041022

10051023
func init() {

api/apps/v1alpha1/zz_generated.deepcopy.go

Lines changed: 37 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/apps.nvidia.com_nimpipelines.yaml

Lines changed: 57 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,58 @@ spec:
101101
items:
102102
type: string
103103
type: array
104+
draResources:
105+
items:
106+
description: |-
107+
DRAResource references exactly one ResourceClaim, either directly
108+
or by naming a ResourceClaimTemplate which is then turned into a ResourceClaim.
109+
110+
It adds a name to it that uniquely identifies the ResourceClaim.
111+
NIMService containers that need access to the ResourceClaim will automatically reference it with this name.
112+
properties:
113+
name:
114+
description: |-
115+
Name uniquely identifies this resource claim.
116+
This must be a DNS_LABEL.
117+
type: string
118+
requests:
119+
description: |-
120+
Requests is the list of requests in the referenced ResourceClaim/ResourceClaimTemplate
121+
to be made available to the model container of the NIMService pods.
122+
123+
If empty, everything from the claim is made available, otherwise
124+
only the result of this subset of requests.
125+
items:
126+
type: string
127+
type: array
128+
resourceClaimName:
129+
description: |-
130+
ResourceClaimName is the name of a ResourceClaim object in the same
131+
namespace as the NIMService.
132+
133+
Exactly one of ResourceClaimName and ResourceClaimTemplateName must
134+
be set.
135+
type: string
136+
resourceClaimTemplateName:
137+
description: |-
138+
ResourceClaimTemplateName is the name of a ResourceClaimTemplate
139+
object in the same namespace as the pods for this NIMService.
140+
141+
The template will be used to create a new ResourceClaim, which will
142+
be bound to the pods created for this NIMService. When the pod is deleted,
143+
the ResourceClaim will also be deleted. The pod name and resource name, along
144+
with a generated component, will be used to form a unique name for the
145+
ResourceClaim, which will be recorded in pod.status.resourceClaimStatuses.
146+
147+
Modifying this field will result in the NIMService going to Failed state.
148+
149+
Exactly one of ResourceClaimName and ResourceClaimTemplateName must
150+
be set.
151+
type: string
152+
required:
153+
- name
154+
type: object
155+
type: array
104156
env:
105157
items:
106158
description: EnvVar represents an environment variable
@@ -1321,54 +1373,12 @@ spec:
13211373
default: 1
13221374
minimum: 1
13231375
type: integer
1324-
resourceClaims:
1325-
items:
1326-
description: |-
1327-
PodResourceClaim references exactly one ResourceClaim, either directly
1328-
or by naming a ResourceClaimTemplate which is then turned into a ResourceClaim
1329-
for the pod.
1330-
1331-
It adds a name to it that uniquely identifies the ResourceClaim inside the Pod.
1332-
Containers that need access to the ResourceClaim reference it with this name.
1333-
properties:
1334-
name:
1335-
description: |-
1336-
Name uniquely identifies this resource claim inside the pod.
1337-
This must be a DNS_LABEL.
1338-
type: string
1339-
resourceClaimName:
1340-
description: |-
1341-
ResourceClaimName is the name of a ResourceClaim object in the same
1342-
namespace as this pod.
1343-
1344-
Exactly one of ResourceClaimName and ResourceClaimTemplateName must
1345-
be set.
1346-
type: string
1347-
resourceClaimTemplateName:
1348-
description: |-
1349-
ResourceClaimTemplateName is the name of a ResourceClaimTemplate
1350-
object in the same namespace as this pod.
1351-
1352-
The template will be used to create a new ResourceClaim, which will
1353-
be bound to this pod. When this pod is deleted, the ResourceClaim
1354-
will also be deleted. The pod name and resource name, along with a
1355-
generated component, will be used to form a unique name for the
1356-
ResourceClaim, which will be recorded in pod.status.resourceClaimStatuses.
1357-
1358-
This field is immutable and no changes will be made to the
1359-
corresponding ResourceClaim by the control plane after creating the
1360-
ResourceClaim.
1361-
1362-
Exactly one of ResourceClaimName and ResourceClaimTemplateName must
1363-
be set.
1364-
type: string
1365-
required:
1366-
- name
1367-
type: object
1368-
type: array
13691376
resources:
1370-
description: ResourceRequirements describes the compute
1371-
resource requirements.
1377+
description: |-
1378+
Resources is the resource requirements for the NIMService deployment.
1379+
1380+
Note: Only traditional resources like cpu/memory and custom device plugin resources are supported here.
1381+
Any DRA claim references are ignored. Use DRAResources instead for those.
13721382
properties:
13731383
claims:
13741384
description: |-

0 commit comments

Comments
 (0)