Skip to content

Commit 161cee6

Browse files
committed
add resourceclaims to containers
Signed-off-by: Varun Ramachandra Sekar <vsekar@nvidia.com>
1 parent cb773eb commit 161cee6

File tree

4 files changed

+293
-3
lines changed

4 files changed

+293
-3
lines changed

internal/controller/platform/standalone/nimservice.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,8 @@ func (r *NIMServiceReconciler) reconcileNIMService(ctx context.Context, nimServi
299299
if len(initContainers) > 0 {
300300
result.Spec.Template.Spec.InitContainers = initContainers
301301
}
302+
// Update Container resources with DRA resource claims.
303+
shared.UpdateContainerResourceClaims(result.Spec.Template.Spec.Containers, nimService.Spec.ResourceClaims)
302304
return result, err
303305

304306
}, "deployment", conditions.ReasonDeploymentFailed)
@@ -708,6 +710,12 @@ func (r *NIMServiceReconciler) getTensorParallelismByProfile(ctx context.Context
708710
func (r *NIMServiceReconciler) assignGPUResources(ctx context.Context, nimService *appsv1alpha1.NIMService, profile *appsv1alpha1.NIMProfile, deploymentParams *rendertypes.DeploymentParams) error {
709711
logger := log.FromContext(ctx)
710712

713+
// TODO: Refine this to detect GPU claims and only assign GPU resources if no GPU claims are present.
714+
if len(nimService.GetResourceClaims()) > 0 {
715+
logger.Info("Resource claims found, skipping GPU resource assignment", "resourceClaims", nimService.GetResourceClaims())
716+
return nil
717+
}
718+
711719
// TODO: Make the resource name configurable
712720
const gpuResourceName = corev1.ResourceName("nvidia.com/gpu")
713721

internal/shared/resourceclaims.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
/*
2+
Copyright 2024.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package shared
18+
19+
import (
20+
corev1 "k8s.io/api/core/v1"
21+
)
22+
23+
// NVIDIA k8s-dra-driver-gpu device classes supported by NIM Operator.
24+
var gpuDeviceClassWhitelist = []string{"gpu.nvidia.com", "mig.nvidia.com"}
25+
26+
func UpdateContainerResourceClaims(containers []corev1.Container, resourceClaims []corev1.PodResourceClaim) {
27+
for _, rc := range resourceClaims {
28+
var found bool
29+
// Check if the resource claim is already referenced by a container.
30+
for _, container := range containers {
31+
for _, claim := range container.Resources.Claims {
32+
if claim.Name == rc.Name {
33+
found = true
34+
break
35+
}
36+
}
37+
}
38+
39+
// Add unreferenced resource claims to containers to prevent container-runtime from overprovisioning.
40+
if !found {
41+
for idx := range containers {
42+
containers[idx].Resources.Claims = append(containers[idx].Resources.Claims, corev1.ResourceClaim{
43+
Name: rc.Name,
44+
})
45+
}
46+
}
47+
}
48+
}
Lines changed: 234 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
package shared
2+
3+
import (
4+
"testing"
5+
6+
"github.com/stretchr/testify/assert"
7+
corev1 "k8s.io/api/core/v1"
8+
)
9+
10+
func TestUpdateContainerResourceClaims(t *testing.T) {
11+
tests := []struct {
12+
name string
13+
containers []corev1.Container
14+
resourceClaims []corev1.PodResourceClaim
15+
expected []corev1.Container
16+
}{
17+
{
18+
name: "add new resource claim to empty containers",
19+
containers: []corev1.Container{
20+
{
21+
Name: "container1",
22+
Resources: corev1.ResourceRequirements{
23+
Claims: []corev1.ResourceClaim{},
24+
},
25+
},
26+
},
27+
resourceClaims: []corev1.PodResourceClaim{
28+
{
29+
Name: "claim1",
30+
},
31+
},
32+
expected: []corev1.Container{
33+
{
34+
Name: "container1",
35+
Resources: corev1.ResourceRequirements{
36+
Claims: []corev1.ResourceClaim{
37+
{
38+
Name: "claim1",
39+
},
40+
},
41+
},
42+
},
43+
},
44+
},
45+
{
46+
name: "add new resource claim to containers with existing claims",
47+
containers: []corev1.Container{
48+
{
49+
Name: "container1",
50+
Resources: corev1.ResourceRequirements{
51+
Claims: []corev1.ResourceClaim{
52+
{
53+
Name: "existing-claim",
54+
},
55+
},
56+
},
57+
},
58+
},
59+
resourceClaims: []corev1.PodResourceClaim{
60+
{
61+
Name: "new-claim",
62+
},
63+
},
64+
expected: []corev1.Container{
65+
{
66+
Name: "container1",
67+
Resources: corev1.ResourceRequirements{
68+
Claims: []corev1.ResourceClaim{
69+
{
70+
Name: "existing-claim",
71+
},
72+
{
73+
Name: "new-claim",
74+
},
75+
},
76+
},
77+
},
78+
},
79+
},
80+
{
81+
name: "do not add duplicate resource claims",
82+
containers: []corev1.Container{
83+
{
84+
Name: "container1",
85+
Resources: corev1.ResourceRequirements{
86+
Claims: []corev1.ResourceClaim{
87+
{
88+
Name: "existing-claim",
89+
},
90+
},
91+
},
92+
},
93+
},
94+
resourceClaims: []corev1.PodResourceClaim{
95+
{
96+
Name: "existing-claim",
97+
},
98+
},
99+
expected: []corev1.Container{
100+
{
101+
Name: "container1",
102+
Resources: corev1.ResourceRequirements{
103+
Claims: []corev1.ResourceClaim{
104+
{
105+
Name: "existing-claim",
106+
},
107+
},
108+
},
109+
},
110+
},
111+
},
112+
{
113+
name: "add multiple resource claims to multiple containers",
114+
containers: []corev1.Container{
115+
{
116+
Name: "container1",
117+
Resources: corev1.ResourceRequirements{
118+
Claims: []corev1.ResourceClaim{},
119+
},
120+
},
121+
{
122+
Name: "container2",
123+
Resources: corev1.ResourceRequirements{
124+
Claims: []corev1.ResourceClaim{},
125+
},
126+
},
127+
},
128+
resourceClaims: []corev1.PodResourceClaim{
129+
{
130+
Name: "claim1",
131+
},
132+
{
133+
Name: "claim2",
134+
},
135+
},
136+
expected: []corev1.Container{
137+
{
138+
Name: "container1",
139+
Resources: corev1.ResourceRequirements{
140+
Claims: []corev1.ResourceClaim{
141+
{
142+
Name: "claim1",
143+
},
144+
{
145+
Name: "claim2",
146+
},
147+
},
148+
},
149+
},
150+
{
151+
Name: "container2",
152+
Resources: corev1.ResourceRequirements{
153+
Claims: []corev1.ResourceClaim{
154+
{
155+
Name: "claim1",
156+
},
157+
{
158+
Name: "claim2",
159+
},
160+
},
161+
},
162+
},
163+
},
164+
},
165+
{
166+
name: "containers with different claims should not get duplicates",
167+
containers: []corev1.Container{
168+
{
169+
Name: "container1",
170+
Resources: corev1.ResourceRequirements{
171+
Claims: []corev1.ResourceClaim{
172+
{
173+
Name: "claim1",
174+
},
175+
},
176+
},
177+
},
178+
{
179+
Name: "container2",
180+
Resources: corev1.ResourceRequirements{
181+
Claims: []corev1.ResourceClaim{
182+
{
183+
Name: "claim2",
184+
},
185+
},
186+
},
187+
},
188+
},
189+
resourceClaims: []corev1.PodResourceClaim{
190+
{
191+
Name: "claim1",
192+
},
193+
{
194+
Name: "claim2",
195+
},
196+
},
197+
expected: []corev1.Container{
198+
{
199+
Name: "container1",
200+
Resources: corev1.ResourceRequirements{
201+
Claims: []corev1.ResourceClaim{
202+
{
203+
Name: "claim1",
204+
},
205+
},
206+
},
207+
},
208+
{
209+
Name: "container2",
210+
Resources: corev1.ResourceRequirements{
211+
Claims: []corev1.ResourceClaim{
212+
{
213+
Name: "claim2",
214+
},
215+
},
216+
},
217+
},
218+
},
219+
},
220+
}
221+
222+
for _, tt := range tests {
223+
t.Run(tt.name, func(t *testing.T) {
224+
// Create a copy of the containers to avoid modifying the test data
225+
containers := make([]corev1.Container, len(tt.containers))
226+
copy(containers, tt.containers)
227+
228+
UpdateContainerResourceClaims(containers, tt.resourceClaims)
229+
230+
// Compare the results
231+
assert.Equal(t, tt.expected, containers)
232+
})
233+
}
234+
}

manifests/deployment.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,9 +103,9 @@ spec:
103103
{{ . | yaml | nindent 10 }}
104104
{{- end }}
105105
{{- if .PodResourceClaims }}
106-
resourceClaims:
107-
{{- .PodResourceClaims | yaml | nindent 10 }}
108-
{{- end }}
106+
resourceClaims:
107+
{{- .PodResourceClaims | yaml | nindent 10 }}
108+
{{- end }}
109109
volumes:
110110
{{- range .Volumes }}
111111
- name: {{ .Name }}

0 commit comments

Comments
 (0)