Skip to content

Commit 1a62ecf

Browse files
Merge branch 'main' into buildable-profile
2 parents 06bb936 + 51169e1 commit 1a62ecf

File tree

17 files changed

+567
-263
lines changed

17 files changed

+567
-263
lines changed

api/apps/v1alpha1/common_types.go

Lines changed: 20 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -308,17 +308,22 @@ type DRAResource struct {
308308
}
309309

310310
// DRAResourceStatus defines the status of the DRAResource.
311+
// +kubebuilder:validation:XValidation:rule="has(self.resourceClaimStatus) != has(self.resourceClaimTemplateStatus)",message="exactly one of resourceClaimStatus and resourceClaimTemplateStatus must be set."
311312
type DRAResourceStatus struct {
313+
// Name is the pod claim name referenced in the pod spec as `spec.resourceClaims[].name` for this DRA resource.
312314
Name string `json:"name"`
313-
// ResourceClaimTemplateName is the name of the ResourceClaimTemplate that was
314-
// used to generate the ResourceClaim for an instance of NIMService.
315-
ResourceClaimTemplateName *string `json:"resourceClaimTemplateName,omitempty"`
316-
// ResourceClaims is the status of resource claims.
317-
ResourceClaims []DRAResourceClaimStatus `json:"resourceClaims,omitempty"`
315+
// ResourceClaimStatus is the status of the resource claim in this DRA resource.
316+
//
317+
// Exactly one of resourceClaimStatus and resourceClaimTemplateStatus will be set.
318+
ResourceClaimStatus *DRAResourceClaimStatusInfo `json:"resourceClaimStatus,omitempty"`
319+
// ResourceClaimTemplateStatus is the status of the resource claim template in this DRA resource.
320+
//
321+
// Exactly one of resourceClaimStatus and resourceClaimTemplateStatus will be set.
322+
ResourceClaimTemplateStatus *DRAResourceClaimTemplateStatusInfo `json:"resourceClaimTemplateStatus,omitempty"`
318323
}
319324

320-
// DRAResourceClaimStatus defines the status of the DRAResourceClaim.
321-
type DRAResourceClaimStatus struct {
325+
// DRAResourceClaimStatusInfo defines the status of a ResourceClaim referenced in the DRAResource.
326+
type DRAResourceClaimStatusInfo struct {
322327
// Name is the name of the ResourceClaim.
323328
Name string `json:"name"`
324329
// State is the state of the ResourceClaim.
@@ -331,3 +336,11 @@ type DRAResourceClaimStatus struct {
331336
// +kubebuilder:validation:default=pending
332337
State string `json:"state"`
333338
}
339+
340+
// DRAResourceClaimTemplateStatusInfo defines the status of a ResourceClaimTemplate referenced in the DRAResource.
341+
type DRAResourceClaimTemplateStatusInfo struct {
342+
// Name is the name of the resource claim template.
343+
Name string `json:"name"`
344+
// ResourceClaimStatuses is the statuses of the generated resource claims from this resource claim template.
345+
ResourceClaimStatuses []DRAResourceClaimStatusInfo `json:"resourceClaimStatuses,omitempty"`
346+
}

api/apps/v1alpha1/nemo_customizer_types.go

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -866,6 +866,11 @@ func (n *NemoCustomizer) GetRoleParams() *rendertypes.RoleParams {
866866
Resources: []string{"pods", "persistentvolumeclaims", "services", "configmaps"},
867867
Verbs: []string{"create", "get", "list", "watch", "delete"},
868868
},
869+
{
870+
APIGroups: []string{""},
871+
Resources: []string{"events"},
872+
Verbs: []string{"create", "get", "list", "watch"},
873+
},
869874
{
870875
APIGroups: []string{"nvidia.com"},
871876
Resources: []string{"nemotrainingjobs", "nemotrainingjobs/status", "nemoentityhandlers"},
@@ -896,8 +901,20 @@ func (n *NemoCustomizer) GetRoleParams() *rendertypes.RoleParams {
896901
},
897902
}
898903

899-
if n.Spec.Scheduler.Type == SchedulerTypeVolcano {
904+
runAIRules := []rbacv1.PolicyRule{
905+
{
906+
APIGroups: []string{"run.ai"},
907+
Resources: []string{"trainingworkloads", "runaijobs"},
908+
Verbs: []string{"create", "get", "list", "watch", "update", "delete", "patch"},
909+
},
910+
}
911+
912+
// Add scheduler specific rules
913+
switch n.Spec.Scheduler.Type {
914+
case SchedulerTypeVolcano:
900915
params.Rules = append(params.Rules, volcanoRules...)
916+
case SchedulerTypeRunAI:
917+
params.Rules = append(params.Rules, runAIRules...)
901918
}
902919

903920
return params

api/apps/v1alpha1/zz_generated.deepcopy.go

Lines changed: 31 additions & 11 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/apps.nvidia.com_nimservices.yaml

Lines changed: 63 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2363,38 +2363,74 @@ spec:
23632363
description: DRAResourceStatus defines the status of the DRAResource.
23642364
properties:
23652365
name:
2366+
description: Name is the pod claim name referenced in the pod
2367+
spec as `spec.resourceClaims[].name` for this DRA resource.
23662368
type: string
2367-
resourceClaimTemplateName:
2369+
resourceClaimStatus:
23682370
description: |-
2369-
ResourceClaimTemplateName is the name of the ResourceClaimTemplate that was
2370-
used to generate the ResourceClaim for an instance of NIMService.
2371-
type: string
2372-
resourceClaims:
2373-
description: ResourceClaims is the status of resource claims.
2374-
items:
2375-
description: DRAResourceClaimStatus defines the status of
2376-
the DRAResourceClaim.
2377-
properties:
2378-
name:
2379-
description: Name is the name of the ResourceClaim.
2380-
type: string
2381-
state:
2382-
description: |-
2383-
State is the state of the ResourceClaim.
2384-
* pending: the resource claim is pending allocation.
2385-
* deleted: the resource claim has a deletion timestamp set but is not yet finalized.
2386-
* allocated: the resource claim is allocated to a pod.
2387-
* reserved: the resource claim is consumed by a pod.
2388-
This field will have one or more of the above values depending on the status of the resource claim.
2389-
type: string
2390-
required:
2391-
- name
2392-
- state
2393-
type: object
2394-
type: array
2371+
ResourceClaimStatus is the status of the resource claim in this DRA resource.
2372+
2373+
Exactly one of resourceClaimStatus and resourceClaimTemplateStatus will be set.
2374+
properties:
2375+
name:
2376+
description: Name is the name of the ResourceClaim.
2377+
type: string
2378+
state:
2379+
description: |-
2380+
State is the state of the ResourceClaim.
2381+
* pending: the resource claim is pending allocation.
2382+
* deleted: the resource claim has a deletion timestamp set but is not yet finalized.
2383+
* allocated: the resource claim is allocated to a pod.
2384+
* reserved: the resource claim is consumed by a pod.
2385+
This field will have one or more of the above values depending on the status of the resource claim.
2386+
type: string
2387+
required:
2388+
- name
2389+
- state
2390+
type: object
2391+
resourceClaimTemplateStatus:
2392+
description: |-
2393+
ResourceClaimTemplateStatus is the status of the resource claim template in this DRA resource.
2394+
2395+
Exactly one of resourceClaimStatus and resourceClaimTemplateStatus will be set.
2396+
properties:
2397+
name:
2398+
description: Name is the name of the resource claim template.
2399+
type: string
2400+
resourceClaimStatuses:
2401+
description: ResourceClaimStatuses is the statuses of the
2402+
generated resource claims from this resource claim template.
2403+
items:
2404+
description: DRAResourceClaimStatusInfo defines the status
2405+
of a ResourceClaim referenced in the DRAResource.
2406+
properties:
2407+
name:
2408+
description: Name is the name of the ResourceClaim.
2409+
type: string
2410+
state:
2411+
description: |-
2412+
State is the state of the ResourceClaim.
2413+
* pending: the resource claim is pending allocation.
2414+
* deleted: the resource claim has a deletion timestamp set but is not yet finalized.
2415+
* allocated: the resource claim is allocated to a pod.
2416+
* reserved: the resource claim is consumed by a pod.
2417+
This field will have one or more of the above values depending on the status of the resource claim.
2418+
type: string
2419+
required:
2420+
- name
2421+
- state
2422+
type: object
2423+
type: array
2424+
required:
2425+
- name
2426+
type: object
23952427
required:
23962428
- name
23972429
type: object
2430+
x-kubernetes-validations:
2431+
- message: exactly one of resourceClaimStatus and resourceClaimTemplateStatus
2432+
must be set.
2433+
rule: has(self.resourceClaimStatus) != has(self.resourceClaimTemplateStatus)
23982434
type: array
23992435
x-kubernetes-list-map-keys:
24002436
- name

bundle/manifests/k8s-nim-operator.clusterserviceversion.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -744,9 +744,12 @@ spec:
744744
resources:
745745
- events
746746
verbs:
747+
- get
747748
- create
748749
- patch
749750
- update
751+
- list
752+
- watch
750753
clusterPermissions:
751754
- serviceAccountName: k8s-nim-operator
752755
rules:
@@ -763,9 +766,12 @@ spec:
763766
resources:
764767
- events
765768
verbs:
769+
- get
766770
- create
767771
- patch
768772
- update
773+
- list
774+
- watch
769775
- apiGroups:
770776
- ''
771777
resources:
@@ -1206,6 +1212,18 @@ spec:
12061212
- get
12071213
- list
12081214
- watch
1215+
- apiGroups:
1216+
- run.ai
1217+
resources:
1218+
- trainingworkloads
1219+
- runaijobs
1220+
verbs:
1221+
- get
1222+
- list
1223+
- watch
1224+
- delete
1225+
- patch
1226+
- update
12091227
- apiGroups:
12101228
- nvidia.com
12111229
resources:

config/crd/bases/apps.nvidia.com_nimservices.yaml

Lines changed: 63 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2363,38 +2363,74 @@ spec:
23632363
description: DRAResourceStatus defines the status of the DRAResource.
23642364
properties:
23652365
name:
2366+
description: Name is the pod claim name referenced in the pod
2367+
spec as `spec.resourceClaims[].name` for this DRA resource.
23662368
type: string
2367-
resourceClaimTemplateName:
2369+
resourceClaimStatus:
23682370
description: |-
2369-
ResourceClaimTemplateName is the name of the ResourceClaimTemplate that was
2370-
used to generate the ResourceClaim for an instance of NIMService.
2371-
type: string
2372-
resourceClaims:
2373-
description: ResourceClaims is the status of resource claims.
2374-
items:
2375-
description: DRAResourceClaimStatus defines the status of
2376-
the DRAResourceClaim.
2377-
properties:
2378-
name:
2379-
description: Name is the name of the ResourceClaim.
2380-
type: string
2381-
state:
2382-
description: |-
2383-
State is the state of the ResourceClaim.
2384-
* pending: the resource claim is pending allocation.
2385-
* deleted: the resource claim has a deletion timestamp set but is not yet finalized.
2386-
* allocated: the resource claim is allocated to a pod.
2387-
* reserved: the resource claim is consumed by a pod.
2388-
This field will have one or more of the above values depending on the status of the resource claim.
2389-
type: string
2390-
required:
2391-
- name
2392-
- state
2393-
type: object
2394-
type: array
2371+
ResourceClaimStatus is the status of the resource claim in this DRA resource.
2372+
2373+
Exactly one of resourceClaimStatus and resourceClaimTemplateStatus will be set.
2374+
properties:
2375+
name:
2376+
description: Name is the name of the ResourceClaim.
2377+
type: string
2378+
state:
2379+
description: |-
2380+
State is the state of the ResourceClaim.
2381+
* pending: the resource claim is pending allocation.
2382+
* deleted: the resource claim has a deletion timestamp set but is not yet finalized.
2383+
* allocated: the resource claim is allocated to a pod.
2384+
* reserved: the resource claim is consumed by a pod.
2385+
This field will have one or more of the above values depending on the status of the resource claim.
2386+
type: string
2387+
required:
2388+
- name
2389+
- state
2390+
type: object
2391+
resourceClaimTemplateStatus:
2392+
description: |-
2393+
ResourceClaimTemplateStatus is the status of the resource claim template in this DRA resource.
2394+
2395+
Exactly one of resourceClaimStatus and resourceClaimTemplateStatus will be set.
2396+
properties:
2397+
name:
2398+
description: Name is the name of the resource claim template.
2399+
type: string
2400+
resourceClaimStatuses:
2401+
description: ResourceClaimStatuses is the statuses of the
2402+
generated resource claims from this resource claim template.
2403+
items:
2404+
description: DRAResourceClaimStatusInfo defines the status
2405+
of a ResourceClaim referenced in the DRAResource.
2406+
properties:
2407+
name:
2408+
description: Name is the name of the ResourceClaim.
2409+
type: string
2410+
state:
2411+
description: |-
2412+
State is the state of the ResourceClaim.
2413+
* pending: the resource claim is pending allocation.
2414+
* deleted: the resource claim has a deletion timestamp set but is not yet finalized.
2415+
* allocated: the resource claim is allocated to a pod.
2416+
* reserved: the resource claim is consumed by a pod.
2417+
This field will have one or more of the above values depending on the status of the resource claim.
2418+
type: string
2419+
required:
2420+
- name
2421+
- state
2422+
type: object
2423+
type: array
2424+
required:
2425+
- name
2426+
type: object
23952427
required:
23962428
- name
23972429
type: object
2430+
x-kubernetes-validations:
2431+
- message: exactly one of resourceClaimStatus and resourceClaimTemplateStatus
2432+
must be set.
2433+
rule: has(self.resourceClaimStatus) != has(self.resourceClaimTemplateStatus)
23982434
type: array
23992435
x-kubernetes-list-map-keys:
24002436
- name

0 commit comments

Comments
 (0)