Commit 781311f

feat: check resource quota in webhook
Signed-off-by: james <[email protected]>
1 parent 0754340 commit 781311f

2 files changed: +209 -0

pkg/scheduler/webhook.go

Lines changed: 53 additions & 0 deletions
@@ -29,6 +29,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
 
 	"github.com/Project-HAMi/HAMi/pkg/device"
+	"github.com/Project-HAMi/HAMi/pkg/device/nvidia"
 	"github.com/Project-HAMi/HAMi/pkg/scheduler/config"
 )
 
@@ -96,10 +97,62 @@ func (h *webhook) Handle(_ context.Context, req admission.Request) admission.Response {
 			return admission.Denied("pod has node assigned")
 		}
 	}
+	if !fitResourceQuota(pod) {
+		return admission.Denied("exceeding resource quota")
+	}
 	marshaledPod, err := json.Marshal(pod)
 	if err != nil {
 		klog.Errorf(template+" - Failed to marshal pod, error: %v", pod.Namespace, pod.Name, pod.UID, err)
 		return admission.Errored(http.StatusInternalServerError, err)
 	}
 	return admission.PatchResponseFromRaw(req.Object.Raw, marshaledPod)
 }
+
+func fitResourceQuota(pod *corev1.Pod) bool {
+	for deviceName, dev := range device.GetDevices() {
+		// Only supports NVIDIA
+		if deviceName != nvidia.NvidiaGPUDevice {
+			continue
+		}
+		memoryFactor := nvidia.MemoryFactor
+		resourceNames := dev.GetResourceNames()
+		resourceName := corev1.ResourceName(resourceNames.ResourceCountName)
+		memResourceName := corev1.ResourceName(resourceNames.ResourceMemoryName)
+		coreResourceName := corev1.ResourceName(resourceNames.ResourceCoreName)
+		var memoryReq int64 = 0
+		var coresReq int64 = 0
+		getRequest := func(ctr *corev1.Container, resName corev1.ResourceName) (int64, bool) {
+			v, ok := ctr.Resources.Limits[resName]
+			if !ok {
+				v, ok = ctr.Resources.Requests[resName]
+			}
+			if ok {
+				if n, ok := v.AsInt64(); ok {
+					return n, true
+				}
+			}
+			return 0, false
+		}
+		for _, ctr := range pod.Spec.Containers {
+			req, ok := getRequest(&ctr, resourceName)
+			if ok && req == 1 {
+				if memReq, ok := getRequest(&ctr, memResourceName); ok {
+					memoryReq += memReq
+				}
+				if coreReq, ok := getRequest(&ctr, coreResourceName); ok {
+					coresReq += coreReq
+				}
+			}
+		}
+		if memoryFactor > 1 {
+			oriMemReq := memoryReq
+			memoryReq = memoryReq * int64(memoryFactor)
+			klog.V(5).Infof("Adjusting memory request for quota check: oriMemReq %d, memoryReq %d, factor %d", oriMemReq, memoryReq, memoryFactor)
+		}
+		if !device.GetLocalCache().FitQuota(pod.Namespace, memoryReq, memoryFactor, coresReq, deviceName) {
+			klog.Infof(template+" - Denying admission", pod.Namespace, pod.Name, pod.UID)
+			return false
+		}
+	}
+	return true
+}
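
For context, the denial decision rests on the per-namespace quota kept in the scheduler's local cache. The sketch below is a minimal, illustrative model of that bookkeeping, inferred from the test fixture further down (Used/Limit pairs per resource name); fitsQuota here is a hypothetical stand-in for device.GetLocalCache().FitQuota, not the actual implementation, and the memory factor is ignored for simplicity.

    package main

    import "fmt"

    // Quota mirrors the Used/Limit pair seeded in the test below.
    type Quota struct {
    	Used  int64
    	Limit int64
    }

    // DeviceQuota maps a resource name (e.g. "nvidia.com/gpumem") to its quota.
    type DeviceQuota map[string]*Quota

    // fitsQuota reports whether a new request still fits: for every tracked
    // resource, already-used plus requested must stay within the limit.
    func fitsQuota(dq DeviceQuota, requests map[string]int64) bool {
    	for name, req := range requests {
    		if q, ok := dq[name]; ok && q.Limit > 0 && q.Used+req > q.Limit {
    			return false
    		}
    	}
    	return true
    }

    func main() {
    	dq := DeviceQuota{
    		"nvidia.com/gpumem":   {Used: 1000, Limit: 2000},
    		"nvidia.com/gpucores": {Used: 200, Limit: 400},
    	}
    	fmt.Println(fitsQuota(dq, map[string]int64{"nvidia.com/gpumem": 100}))  // true: 1000+100 <= 2000
    	fmt.Println(fitsQuota(dq, map[string]int64{"nvidia.com/gpumem": 1024})) // false: 1000+1024 > 2000
    }

This mirrors the expectations encoded in TestFitResourceQuota below: a 100-unit gpumem request is admitted against a 1000/2000 quota, while a 1024-unit request is denied.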

pkg/scheduler/webhook_test.go

Lines changed: 156 additions & 0 deletions
@@ -29,6 +29,7 @@ import (
 	"k8s.io/klog/v2"
 	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
 
+	"github.com/Project-HAMi/HAMi/pkg/device"
 	"github.com/Project-HAMi/HAMi/pkg/device/nvidia"
 	"github.com/Project-HAMi/HAMi/pkg/scheduler/config"
 )
@@ -240,3 +241,158 @@ func TestPodHasDifferentScheduler(t *testing.T) {
 		t.Errorf("Expected allowed response for pod with different scheduler, but got: %v", resp)
 	}
 }
+
+func TestFitResourceQuota(t *testing.T) {
+	config.SchedulerName = "hami-scheduler"
+
+	sConfig := &config.Config{
+		NvidiaConfig: nvidia.NvidiaConfig{
+			ResourceCountName:            "nvidia.com/gpu",
+			ResourceMemoryName:           "nvidia.com/gpumem",
+			ResourceMemoryPercentageName: "nvidia.com/gpumem-percentage",
+			ResourceCoreName:             "nvidia.com/gpucores",
+			DefaultMemory:                0,
+			DefaultCores:                 0,
+			DefaultGPUNum:                1,
+			MemoryFactor:                 1,
+		},
+	}
+
+	if err := config.InitDevicesWithConfig(sConfig); err != nil {
+		klog.Fatalf("Failed to initialize devices with config: %v", err)
+	}
+
+	qm := device.NewQuotaManager()
+	ns := "default"
+	memName := "nvidia.com/gpumem"
+	coreName := "nvidia.com/gpucores"
+
+	qm.Quotas[ns] = &device.DeviceQuota{
+		memName:  &device.Quota{Used: 1000, Limit: 2000},
+		coreName: &device.Quota{Used: 200, Limit: 400},
+	}
+
+	testCases := []struct {
+		name string
+		pod  *corev1.Pod
+		fit  bool
+	}{
+		{
+			name: "quota passed",
+			pod: &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-pod",
+					Namespace: "default",
+				},
+				Spec: corev1.PodSpec{
+					SchedulerName: "hami-scheduler",
+					Containers: []corev1.Container{
+						{
+							Name: "container1",
+							SecurityContext: &corev1.SecurityContext{
+								Privileged: nil,
+							},
+							Resources: corev1.ResourceRequirements{
+								Limits: corev1.ResourceList{
+									"nvidia.com/gpu":    resource.MustParse("1"),
+									"nvidia.com/gpumem": resource.MustParse("100"),
+								},
+							},
+						},
+					},
+				},
+			},
+			fit: true,
+		},
+		{
+			name: "quota exceeded",
+			pod: &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-pod",
+					Namespace: "default",
+				},
+				Spec: corev1.PodSpec{
+					SchedulerName: "hami-scheduler",
+					Containers: []corev1.Container{
+						{
+							Name: "container1",
+							SecurityContext: &corev1.SecurityContext{
+								Privileged: nil,
+							},
+							Resources: corev1.ResourceRequirements{
+								Limits: corev1.ResourceList{
+									"nvidia.com/gpu":    resource.MustParse("1"),
+									"nvidia.com/gpumem": resource.MustParse("1024"),
+								},
+							},
+						},
+					},
+				},
+			},
+			fit: false,
+		},
+		{
+			name: "request multiple gpus",
+			pod: &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-pod",
+					Namespace: "default",
+				},
+				Spec: corev1.PodSpec{
+					SchedulerName: "hami-scheduler",
+					Containers: []corev1.Container{
+						{
+							Name: "container1",
+							SecurityContext: &corev1.SecurityContext{
+								Privileged: nil,
+							},
+							Resources: corev1.ResourceRequirements{
+								Limits: corev1.ResourceList{
+									"nvidia.com/gpu":    resource.MustParse("2"),
+									"nvidia.com/gpumem": resource.MustParse("1024"),
+								},
+							},
+						},
+					},
+				},
+			},
+			fit: true,
+		},
+		{
+			name: "request ascend",
+			pod: &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-pod",
+					Namespace: "default",
+				},
+				Spec: corev1.PodSpec{
+					SchedulerName: "hami-scheduler",
+					Containers: []corev1.Container{
+						{
+							Name: "container1",
+							SecurityContext: &corev1.SecurityContext{
+								Privileged: nil,
+							},
+							Resources: corev1.ResourceRequirements{
+								Limits: corev1.ResourceList{
+									"huawei.com/Ascend910B":        resource.MustParse("1"),
+									"huawei.com/Ascend910B-memory": resource.MustParse("1024"),
+								},
+							},
+						},
+					},
+				},
+			},
+			fit: true,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			result := fitResourceQuota(tc.pod)
+			if tc.fit != result {
+				t.Errorf("Expected %v, but got %v", tc.fit, result)
+			}
+		})
+	}
+}
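
The new cases exercise fitResourceQuota directly against the seeded quota manager; assuming the store created by device.NewQuotaManager() is the one consulted by device.GetLocalCache(), they can be run in isolation from the repository root with the standard Go test runner:

    go test ./pkg/scheduler/ -run TestFitResourceQuota -v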
