
Commit 7de6fa3

committed
fixup! Allow VPA updater to actuate recommendations in-place
Signed-off-by: Max Cao <[email protected]>
1 parent 40ad8c2 commit 7de6fa3

File tree

11 files changed: +238 additions, -198 deletions


vertical-pod-autoscaler/deploy/vpa-rbac.yaml

Lines changed: 1 addition & 0 deletions
@@ -129,6 +129,7 @@ rules:
   - ""
   resources:
   - pods/resize
+  - pods
   verbs:
   - patch
 ---

vertical-pod-autoscaler/e2e/v1/common.go

Lines changed: 2 additions & 0 deletions
@@ -455,6 +455,8 @@ func CheckNoPodsEvicted(f *framework.Framework, initialPodSet PodSet) {
     gomega.Expect(restarted).To(gomega.Equal(0), "there should be no pod evictions")
 }
 
+// CheckNoContainersRestarted waits for long enough period for VPA to start
+// updating containers in-place and checks that no containers were restarted.
 func CheckNoContainersRestarted(f *framework.Framework) {
     var foundContainerRestarts int32
     time.Sleep(VpaEvictionTimeout)
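The new helper's assertion only makes sense if restarts are counted across every container in the test pods. Below is a minimal sketch of that accounting using plain client-go rather than the e2e framework's helpers; the countContainerRestarts name, the clientset, namespace, and label selector are all placeholders, not part of this commit.

package e2esketch

import (
    "context"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
)

// countContainerRestarts sums RestartCount over every container status of the
// matching pods; an unchanged total before and after the updater acts implies
// no container was restarted by an in-place resize.
func countContainerRestarts(ctx context.Context, c kubernetes.Interface, namespace, selector string) (int32, error) {
    pods, err := c.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{LabelSelector: selector})
    if err != nil {
        return 0, err
    }
    var restarts int32
    for _, pod := range pods.Items {
        for _, status := range pod.Status.ContainerStatuses {
            restarts += status.RestartCount
        }
    }
    return restarts, nil
}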

vertical-pod-autoscaler/pkg/admission-controller/resource/pod/patch/resource_updates.go

Lines changed: 6 additions & 6 deletions
@@ -77,7 +77,7 @@ func getContainerPatch(pod *core.Pod, i int, annotationsPerContainer vpa_api_uti
     // Add empty resources object if missing.
     if pod.Spec.Containers[i].Resources.Limits == nil &&
         pod.Spec.Containers[i].Resources.Requests == nil {
-        patches = append(patches, getPatchInitializingEmptyResources(i))
+        patches = append(patches, GetPatchInitializingEmptyResources(i))
     }
 
     annotations, found := annotationsPerContainer[pod.Spec.Containers[i].Name]
@@ -95,31 +95,31 @@ func getContainerPatch(pod *core.Pod, i int, annotationsPerContainer vpa_api_uti
 func appendPatchesAndAnnotations(patches []resource_admission.PatchRecord, annotations []string, current core.ResourceList, containerIndex int, resources core.ResourceList, fieldName, resourceName string) ([]resource_admission.PatchRecord, []string) {
     // Add empty object if it's missing and we're about to fill it.
     if current == nil && len(resources) > 0 {
-        patches = append(patches, getPatchInitializingEmptyResourcesSubfield(containerIndex, fieldName))
+        patches = append(patches, GetPatchInitializingEmptyResourcesSubfield(containerIndex, fieldName))
     }
     for resource, request := range resources {
-        patches = append(patches, getAddResourceRequirementValuePatch(containerIndex, fieldName, resource, request))
+        patches = append(patches, GetAddResourceRequirementValuePatch(containerIndex, fieldName, resource, request))
         annotations = append(annotations, fmt.Sprintf("%s %s", resource, resourceName))
     }
     return patches, annotations
 }
 
-func getAddResourceRequirementValuePatch(i int, kind string, resource core.ResourceName, quantity resource.Quantity) resource_admission.PatchRecord {
+func GetAddResourceRequirementValuePatch(i int, kind string, resource core.ResourceName, quantity resource.Quantity) resource_admission.PatchRecord {
     return resource_admission.PatchRecord{
         Op:    "add",
         Path:  fmt.Sprintf("/spec/containers/%d/resources/%s/%s", i, kind, resource),
         Value: quantity.String()}
 }
 
-func getPatchInitializingEmptyResources(i int) resource_admission.PatchRecord {
+func GetPatchInitializingEmptyResources(i int) resource_admission.PatchRecord {
     return resource_admission.PatchRecord{
         Op:    "add",
         Path:  fmt.Sprintf("/spec/containers/%d/resources", i),
         Value: core.ResourceRequirements{},
     }
 }
 
-func getPatchInitializingEmptyResourcesSubfield(i int, kind string) resource_admission.PatchRecord {
+func GetPatchInitializingEmptyResourcesSubfield(i int, kind string) resource_admission.PatchRecord {
     return resource_admission.PatchRecord{
         Op:    "add",
         Path:  fmt.Sprintf("/spec/containers/%d/resources/%s", i, kind),
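Exporting these helpers lets the updater reuse the admission controller's patch construction for in-place resizes. For illustration only, here is a minimal sketch of the JSON Patch document such a record marshals into; the local PatchRecord struct is an assumption that mirrors the Op/Path/Value shape of resource_admission.PatchRecord rather than an import of it.

package main

import (
    "encoding/json"
    "fmt"
)

// PatchRecord mirrors the Op/Path/Value shape of an RFC 6902 JSON Patch
// operation as used by the VPA patch helpers; defined locally only so this
// sketch is self-contained.
type PatchRecord struct {
    Op    string      `json:"op"`
    Path  string      `json:"path"`
    Value interface{} `json:"value"`
}

func main() {
    // Roughly what GetAddResourceRequirementValuePatch(0, "requests", "cpu", q)
    // yields for the first container when q.String() == "250m".
    patches := []PatchRecord{
        {Op: "add", Path: "/spec/containers/0/resources/requests/cpu", Value: "250m"},
    }
    out, err := json.Marshal(patches)
    if err != nil {
        panic(err)
    }
    fmt.Println(string(out))
    // Prints: [{"op":"add","path":"/spec/containers/0/resources/requests/cpu","value":"250m"}]
}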

vertical-pod-autoscaler/pkg/updater/eviction/inplace_recommendation_provider.go

Lines changed: 0 additions & 138 deletions
This file was deleted.

vertical-pod-autoscaler/pkg/updater/eviction/pods_eviction_restriction.go

Lines changed: 55 additions & 45 deletions
@@ -291,12 +291,9 @@ func (f *podsEvictionRestrictionFactoryImpl) NewPodsEvictionRestriction(pods []*
             }
             if IsInPlaceUpdating(pod) {
                 singleGroup.inPlaceUpdating = singleGroup.inPlaceUpdating + 1
-
             }
         }
         singleGroup.running = len(replicas) - singleGroup.pending
-
-        // This has to happen last, singlegroup never gets returned, only this does
         creatorToSingleGroupStatsMap[creator] = singleGroup
 
     }
@@ -460,8 +457,6 @@ func (e *podsEvictionRestrictionImpl) CanInPlaceUpdate(pod *apiv1.Pod) bool {
         return false
     }
 
-    // TODO(jkyros): is there a pod-level thing we can use?
-    // Go through each container, and check to see if this is going to cause a disruption or not
     noRestartPoliciesPopulated := true
 
     for _, container := range pod.Spec.Containers {
@@ -498,18 +493,31 @@ func (e *podsEvictionRestrictionImpl) CanInPlaceUpdate(pod *apiv1.Pod) bool {
         return false
     }
     // This second "present" check is against the creator-to-group-stats map, not the pod-to-replica map
+    // TODO(maxcao13): Not sure, but do we need disruption tolerance for in-place updates? I guess we do since they are not guaranteed to be disruptionless...
+    // TODO(maxcao13): If this is okay, I may have to rename evictionTolerance to disruption tolerance
     if present {
-        klog.V(4).Infof("pod %s disruption tolerance run: %d config: %d tolerance: %d evicted: %d updating: %d", pod.Name, singleGroupStats.running, singleGroupStats.configured, singleGroupStats.evictionTolerance, singleGroupStats.evicted, singleGroupStats.inPlaceUpdating)
+        klog.V(4).InfoS("Checking pod disruption tolerance",
+            "podName", pod.Name,
+            "configuredPods", singleGroupStats.configured,
+            "runningPods", singleGroupStats.running,
+            "evictedPods", singleGroupStats.evicted,
+            "inPlaceUpdatingPods", singleGroupStats.inPlaceUpdating,
+            "evictionTolerance", singleGroupStats.evictionTolerance,
+        )
+        // minimum number of pods that should be running to tolerate disruptions
         shouldBeAlive := singleGroupStats.configured - singleGroupStats.evictionTolerance
-        if singleGroupStats.running-(singleGroupStats.evicted+singleGroupStats.inPlaceUpdating) > shouldBeAlive {
-            klog.V(4).Infof("Should be alive: %d, Actually alive: %d", shouldBeAlive, singleGroupStats.running-(singleGroupStats.evicted+singleGroupStats.inPlaceUpdating))
+        // number of pods that are actually running
+        actuallyAlive := singleGroupStats.running - (singleGroupStats.evicted + singleGroupStats.inPlaceUpdating)
+        if actuallyAlive > shouldBeAlive {
+            klog.V(4).InfoS("Pod can be resized in-place; more pods are running than required", "podName", pod.Name, "shouldBeAlive", shouldBeAlive, "actuallyAlive", actuallyAlive)
             return true
         }
-        // If all pods are running and eviction tolerance is small update 1 pod.
+
+        // If all pods are running, no pods are being evicted or updated, and eviction tolerance is small, we can resize in-place
         if singleGroupStats.running == singleGroupStats.configured &&
             singleGroupStats.evictionTolerance == 0 &&
             singleGroupStats.evicted == 0 && singleGroupStats.inPlaceUpdating == 0 {
-            klog.V(4).Infof("--> we are in good shape on %s, it is tolerant", pod.Name)
+            klog.V(4).InfoS("Pod can be resized in-place; all pods are running and eviction tolerance is 0", "podName", pod.Name)
             return true
         }
     }
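As a worked example of the tolerance arithmetic in this hunk, here is a standalone sketch with plain integers standing in for the singleGroupStats fields; the replica count and tolerance are made-up values, not taken from the commit.

package main

import "fmt"

func main() {
    // Made-up replica group: 3 configured pods, eviction tolerance of 1.
    configured, evictionTolerance := 3, 1
    running, evicted, inPlaceUpdating := 3, 0, 0

    // Minimum number of pods that must stay up to tolerate a disruption.
    shouldBeAlive := configured - evictionTolerance // 2

    // Pods actually still serving: not evicted and not mid-resize.
    actuallyAlive := running - (evicted + inPlaceUpdating) // 3

    // 3 > 2, so one pod may be resized in-place. Once it starts,
    // actuallyAlive drops to 2, the strict ">" no longer holds, and further
    // in-place updates wait until that resize finishes.
    fmt.Println(actuallyAlive > shouldBeAlive) // true
}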
@@ -526,43 +534,61 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
         return fmt.Errorf("pod not suitable for eviction %v : not in replicated pods map", podToUpdate.Name)
     }
 
-    if !e.CanInPlaceUpdate(podToUpdate) {
-        return fmt.Errorf("cannot update pod %v in place : number of in-flight updates exceeded", podToUpdate.Name)
-    }
+    // TODO(maxcao13): Not sure if we need to check again here, but commenting it out for now in case we do
+    // if !e.CanInPlaceUpdate(podToUpdate) {
+    //     return fmt.Errorf("cannot update pod %v in place : number of in-flight updates exceeded", podToUpdate.Name)
+    // }
 
-    // TODO(maxcao13): There's probably a more efficient way to do this, but this is what we have for now
-    // Don't reinvent the wheel, just use the resource updates patch calculator using by the admission controller
-    // See the below TODO for refactoring this
-    patches := []resource_updates.PatchRecord{}
+    // TODO(maxcao13): There's maybe a more efficient way to do this, but this is what we have for now
+
+    // separate patches since we have to patch resize and spec separately
+    resourcePatches := []resource_updates.PatchRecord{}
+    annotationPatches := []resource_updates.PatchRecord{}
     if podToUpdate.Annotations == nil {
-        patches = append(patches, patch.GetAddEmptyAnnotationsPatch())
+        annotationPatches = append(annotationPatches, patch.GetAddEmptyAnnotationsPatch())
     }
-    for _, calculator := range e.patchCalculators {
+    for i, calculator := range e.patchCalculators {
         p, err := calculator.CalculatePatches(podToUpdate, vpa)
         if err != nil {
            return fmt.Errorf("failed to calculate resource patch for pod %s/%s: %v", podToUpdate.Namespace, podToUpdate.Name, err)
         }
         klog.V(4).InfoS("Calculated patches for pod", "pod", klog.KObj(podToUpdate), "patches", p)
-        patches = append(patches, p...)
+        // TODO(maxcao13): change how this works later, this is gross and depends on the resource calculator being first in the slice
+        // we may not even want the updater to patch pod annotations at all
+        if i == 0 {
+            resourcePatches = append(resourcePatches, p...)
+        } else {
+            annotationPatches = append(annotationPatches, p...)
+        }
     }
-
-    if len(patches) > 0 {
-        patch, err := json.Marshal(patches)
+    if len(resourcePatches) > 0 {
+        patch, err := json.Marshal(resourcePatches)
         if err != nil {
-            klog.Errorf("Cannot marshal the patch %v: %v", patches, err)
+            klog.Errorf("Cannot marshal the patch %v: %v", resourcePatches, err)
            return err
         }
 
-        // TODO(maxcao13): for now, this does not allow other patches to be applied since we are patching the resize subresource
-        // If we want other annotations to be patched in the pod using patchCalculators, we need to generalize this and refactor the
-        // NewResourceUpdatesCalculator and create a stripped down calculator just for calculating in-place resize patches
         res, err := e.client.CoreV1().Pods(podToUpdate.Namespace).Patch(context.TODO(), podToUpdate.Name, k8stypes.JSONPatchType, patch, metav1.PatchOptions{}, "resize")
         if err != nil {
             klog.ErrorS(err, "Failed to patch pod", "pod", klog.KObj(podToUpdate))
             return err
         }
         klog.V(4).InfoS("In-place patched pod /resize subresource using patches ", "pod", klog.KObj(res), "patches", string(patch))
 
+        // TODO(maxcao13): whether or not we apply annotation patches should depend on resource patches?
+        if len(annotationPatches) > 0 {
+            patch, err := json.Marshal(annotationPatches)
+            if err != nil {
+                klog.Errorf("Cannot marshal the patch %v: %v", annotationPatches, err)
+                return err
+            }
+            res, err = e.client.CoreV1().Pods(podToUpdate.Namespace).Patch(context.TODO(), podToUpdate.Name, k8stypes.JSONPatchType, patch, metav1.PatchOptions{})
+            if err != nil {
+                klog.ErrorS(err, "Failed to patch pod", "pod", klog.KObj(podToUpdate))
+                return err
+            }
+            klog.V(4).InfoS("Patched pod annotations", "pod", klog.KObj(res), "patches", string(patch))
+        }
     } else {
         err := fmt.Errorf("no patches to apply to %s", podToUpdate.Name)
         klog.ErrorS(err, "Failed to patch pod", "pod", klog.KObj(podToUpdate))
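The resource and annotation patches are sent separately because the first targets the pod's resize subresource while the second targets the pod object itself, which is also why the RBAC change above grants patch on both pods/resize and pods. A minimal sketch of that call pattern with client-go, assuming the JSON Patch payloads are already marshaled; the function name and parameters are placeholders and error handling is trimmed to the essentials.

package inplacesketch

import (
    "context"

    apiv1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    k8stypes "k8s.io/apimachinery/pkg/types"
    "k8s.io/client-go/kubernetes"
)

// applyInPlacePatches mirrors the two-step flow above: resource changes go to
// the "resize" subresource, annotation changes go to the pod object itself.
func applyInPlacePatches(ctx context.Context, client kubernetes.Interface, pod *apiv1.Pod, resourcePatch, annotationPatch []byte) error {
    // Step 1: JSON-patch the resize subresource with the new resource values.
    if _, err := client.CoreV1().Pods(pod.Namespace).Patch(ctx, pod.Name,
        k8stypes.JSONPatchType, resourcePatch, metav1.PatchOptions{}, "resize"); err != nil {
        return err
    }
    // Step 2: JSON-patch the pod itself with any annotation updates.
    if len(annotationPatch) > 0 {
        if _, err := client.CoreV1().Pods(pod.Namespace).Patch(ctx, pod.Name,
            k8stypes.JSONPatchType, annotationPatch, metav1.PatchOptions{}); err != nil {
            return err
        }
    }
    return nil
}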
@@ -571,8 +597,8 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
 
     // TODO(maxcao13): If this keeps getting called on the same object with the same reason, it is considered a patch request.
     // And we fail to have the corresponding rbac for it. So figure out if we need this later.
-    eventRecorder.Event(podToUpdate, apiv1.EventTypeNormal, "InPlaceResizedByVPA",
-        "Pod was resized in place by VPA Updater.")
+    // TODO(maxcao13): Do we even need to emit an event? The api server might reject the resize request. Should we rename this to InPlaceResizeAttempted?
+    // eventRecorder.Event(podToUpdate, apiv1.EventTypeNormal, "InPlaceResizedByVPA", "Pod was resized in place by VPA Updater.")
 
     // TODO(jkyros): You need to do this regardless once you update the pod, if it changes phases here as a result, you still
     // need to catalog what you did
@@ -604,21 +630,5 @@ func IsInPlaceUpdating(podToCheck *apiv1.Pod) (isUpdating bool) {
         }
         return true
     }
-
-    // If any of the container resources don't match their spec, it's...updating but the lifecycle hasn't kicked in yet? So we
-    // also need to mark that?
-    /*
-        for num, container := range podToCheck.Spec.Containers {
-            // TODO(jkyros): supported resources only?
-            // Resources can be nil, especially if the feature gate isn't on
-            if podToCheck.Status.ContainerStatuses[num].Resources != nil {
-
-                if !reflect.DeepEqual(container.Resources, *podToCheck.Status.ContainerStatuses[num].Resources) {
-                    klog.V(4).Infof("Resize must be in progress for %s, resources for container %s don't match", podToCheck.Name, container.Name)
-                    return true
-                }
-            }
-        }*/
     return false
-
 }
