Commit 57c2efa

VPA: fix logs and cleanup TODOs according to review
Signed-off-by: Max Cao <[email protected]>
1 parent d2c0c9a · commit 57c2efa

File tree

7 files changed: +169 −180 lines changed
vertical-pod-autoscaler/e2e/v1/actuation.go

Lines changed: 3 additions & 3 deletions

@@ -415,7 +415,7 @@ var _ = ActuationSuiteE2eDescribe("Actuation", func() {
 
 		InstallVPA(f, vpaCRD)
 
-		ginkgo.By(fmt.Sprintf("Waiting for in-place update, hoping it won't happen, sleep for %s", VpaEvictionTimeout.String()))
+		ginkgo.By(fmt.Sprintf("Waiting for in-place update, hoping it won't happen, sleep for %s", VpaInPlaceTimeout.String()))
 		CheckNoContainersRestarted(f)
 
 		ginkgo.By("Waiting for pods to be evicted")
@@ -457,7 +457,7 @@ var _ = ActuationSuiteE2eDescribe("Actuation", func() {
 
 		InstallVPA(f, vpaCRD)
 
-		ginkgo.By(fmt.Sprintf("Waiting for in-place update, hoping it won't happen, sleep for %s", VpaEvictionTimeout.String()))
+		ginkgo.By(fmt.Sprintf("Waiting for in-place update, hoping it won't happen, sleep for %s", VpaInPlaceTimeout.String()))
 		CheckNoContainersRestarted(f)
 
 		ginkgo.By("Waiting for pods to be evicted")
@@ -499,7 +499,7 @@ var _ = ActuationSuiteE2eDescribe("Actuation", func() {
 
 		InstallVPA(f, vpaCRD)
 
-		ginkgo.By(fmt.Sprintf("Waiting for in-place update, hoping it won't happen, sleep for %s", VpaEvictionTimeout.String()))
+		ginkgo.By(fmt.Sprintf("Waiting for in-place update, hoping it won't happen, sleep for %s", VpaInPlaceTimeout.String()))
 		CheckNoContainersRestarted(f)
 
 		ginkgo.By("Waiting for pods to be evicted")

vertical-pod-autoscaler/e2e/v1/common.go

Lines changed: 4 additions & 1 deletion

@@ -53,6 +53,9 @@ const (
 	// VpaEvictionTimeout is a timeout for VPA to restart a pod if there are no
 	// mechanisms blocking it (for example PDB).
 	VpaEvictionTimeout = 3 * time.Minute
+	// VpaInPlaceTimeout is a timeout for the VPA to finish in-place resizing a
+	// pod (time for the VPA to request the in-place resize -> InProgress -> done).
+	VpaInPlaceTimeout = 2 * time.Minute
 
 	defaultHamsterReplicas     = int32(3)
 	defaultHamsterBackoffLimit = int32(10)
@@ -459,7 +462,7 @@ func CheckNoPodsEvicted(f *framework.Framework, initialPodSet PodSet) {
 // updating containers in-place and checks that no containers were restarted.
 func CheckNoContainersRestarted(f *framework.Framework) {
 	var foundContainerRestarts int32
-	time.Sleep(VpaEvictionTimeout)
+	time.Sleep(VpaInPlaceTimeout)
 	podList, err := GetHamsterPods(f)
 	for _, pod := range podList.Items {
 		containerRestarts := getContainerRestarts(pod.Status)
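
CheckNoContainersRestarted relies on a getContainerRestarts helper that is outside this diff. A minimal sketch of what such a helper might look like, assuming it simply totals RestartCount over the pod's container statuses (name and signature inferred from the call site above, not from the source):

	// Hypothetical sketch of the helper referenced above; not part of this diff.
	func getContainerRestarts(podStatus apiv1.PodStatus) int32 {
		var restarts int32
		for _, containerStatus := range podStatus.ContainerStatuses {
			// RestartCount is part of the core/v1 ContainerStatus API.
			restarts += containerStatus.RestartCount
		}
		return restarts
	}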

vertical-pod-autoscaler/e2e/v1/updater.go

Lines changed: 55 additions & 8 deletions

@@ -81,6 +81,7 @@ var _ = UpdaterE2eDescribe("Updater", func() {
 		ginkgo.By("Waiting for pods to be in-place updated")
 
 		//gomega.Expect(err).NotTo(gomega.HaveOccurred())
+		// TODO(maxcao13): I don't think we need this much complexity for checking inplace, but I won't remove it for now
 		err := WaitForPodsUpdatedWithoutEviction(f, initialPods, podList)
 		gomega.Expect(err).NotTo(gomega.HaveOccurred())
 	})
@@ -120,6 +121,45 @@ var _ = UpdaterE2eDescribe("Updater", func() {
 		gomega.Expect(err).NotTo(gomega.HaveOccurred())
 	})
 
+	ginkgo.It("does not in-place update pods when there is no recommendation", func() {
+		const statusUpdateInterval = 10 * time.Second
+
+		ginkgo.By("Setting up the Admission Controller status")
+		stopCh := make(chan struct{})
+		statusUpdater := status.NewUpdater(
+			f.ClientSet,
+			status.AdmissionControllerStatusName,
+			status.AdmissionControllerStatusNamespace,
+			statusUpdateInterval,
+			"e2e test",
+		)
+		defer func() {
+			// Schedule a cleanup of the Admission Controller status.
+			// Status is created outside the test namespace.
+			ginkgo.By("Deleting the Admission Controller status")
+			close(stopCh)
+			err := f.ClientSet.CoordinationV1().Leases(status.AdmissionControllerStatusNamespace).
+				Delete(context.TODO(), status.AdmissionControllerStatusName, metav1.DeleteOptions{})
+			gomega.Expect(err).NotTo(gomega.HaveOccurred())
+		}()
+		statusUpdater.Run(stopCh)
+
+		podList := setupPodsForUpscalingWithoutRecommendation(f)
+		if len(podList.Items[0].Spec.Containers[0].ResizePolicy) <= 0 {
+			// Feature is probably not working here
+			ginkgo.Skip("Skipping test, InPlacePodVerticalScaling not available")
+		}
+
+		ginkgo.By(fmt.Sprintf("Waiting for pods to be in-place updated, hoping it won't happen, sleep for %s", VpaInPlaceTimeout.String()))
+		CheckNoContainersRestarted(f)
+
+		updatedPodList, err := GetHamsterPods(f)
+		gomega.Expect(err).NotTo(gomega.HaveOccurred())
+		for _, pod := range updatedPodList.Items {
+			gomega.Expect(getCPURequest(pod.Spec)).To(gomega.Equal(ParseQuantityOrDie("100m")))
+		}
+	})
+
 	ginkgo.It("evicts pods when Admission Controller status available", func() {
 		const statusUpdateInterval = 10 * time.Second
 
@@ -265,14 +305,18 @@ func setupPodsForEviction(f *framework.Framework, hamsterCPU, hamsterMemory stri
 }
 
 func setupPodsForUpscalingInPlace(f *framework.Framework) *apiv1.PodList {
-	return setupPodsForInPlace(f, "100m", "100Mi", nil)
+	return setupPodsForInPlace(f, "100m", "100Mi", nil, true)
 }
 
 func setupPodsForDownscalingInPlace(f *framework.Framework, er []*vpa_types.EvictionRequirement) *apiv1.PodList {
-	return setupPodsForInPlace(f, "500m", "500Mi", er)
+	return setupPodsForInPlace(f, "500m", "500Mi", er, true)
+}
+
+func setupPodsForUpscalingWithoutRecommendation(f *framework.Framework) *apiv1.PodList {
+	return setupPodsForInPlace(f, "100m", "100Mi", nil, false)
 }
 
-func setupPodsForInPlace(f *framework.Framework, hamsterCPU, hamsterMemory string, er []*vpa_types.EvictionRequirement) *apiv1.PodList {
+func setupPodsForInPlace(f *framework.Framework, hamsterCPU, hamsterMemory string, er []*vpa_types.EvictionRequirement, withRecommendation bool) *apiv1.PodList {
 	controller := &autoscaling.CrossVersionObjectReference{
 		APIVersion: "apps/v1",
 		Kind:       "Deployment",
@@ -285,22 +329,25 @@ func setupPodsForInPlace(f *framework.Framework, hamsterCPU, hamsterMemory strin
 
 	ginkgo.By("Setting up a VPA CRD")
 	containerName := GetHamsterContainerNameByIndex(0)
-	vpaCRD := test.VerticalPodAutoscaler().
+	vpaBuilder := test.VerticalPodAutoscaler().
 		WithName("hamster-vpa").
 		WithNamespace(f.Namespace.Name).
 		WithTargetRef(controller).
 		WithUpdateMode(vpa_types.UpdateModeInPlaceOrRecreate).
 		WithEvictionRequirements(er).
-		WithContainer(containerName).
-		AppendRecommendation(
+		WithContainer(containerName)
+
+	if withRecommendation {
+		vpaBuilder = vpaBuilder.AppendRecommendation(
 			test.Recommendation().
 				WithContainer(containerName).
 				WithTarget(containerName, "200m").
 				WithLowerBound(containerName, "200m").
 				WithUpperBound(containerName, "200m").
-				GetContainerResources()).
-		Get()
+				GetContainerResources())
+	}
 
+	vpaCRD := vpaBuilder.Get()
 	InstallVPA(f, vpaCRD)
 
 	return podList
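
The assertion loop in the new test calls getCPURequest, which this diff does not touch. A minimal sketch under the assumption that it reads the CPU request of the first container in the pod spec (signature inferred from the getCPURequest(pod.Spec) call site):

	// Hypothetical sketch; resource is k8s.io/apimachinery/pkg/api/resource.
	func getCPURequest(podSpec apiv1.PodSpec) resource.Quantity {
		return podSpec.Containers[0].Resources.Requests[apiv1.ResourceCPU]
	}

With pods set up at "100m" and no recommendation installed, the test expects this value to still equal ParseQuantityOrDie("100m") after the wait.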

vertical-pod-autoscaler/pkg/updater/eviction/pods_eviction_restriction.go

Lines changed: 28 additions & 60 deletions

@@ -124,20 +124,19 @@ func (e *podsEvictionRestrictionImpl) CanEvict(pod *apiv1.Pod) bool {
 		// they might cause disruption. We assume pods will not be both in-place updated and evicted in the same pass, but
 		// we need eviction to take the numbers into account so we don't violate our disruption tolerances.
 		// If we're already resizing this pod, don't do anything to it, unless we failed to resize it, then we want to evict it.
+		klog.V(4).InfoS("Pod disruption tolerance",
+			"pod", klog.KObj(pod),
+			"running", singleGroupStats.running,
+			"configured", singleGroupStats.configured,
+			"tolerance", singleGroupStats.evictionTolerance,
+			"evicted", singleGroupStats.evicted,
+			"updating", singleGroupStats.inPlaceUpdating)
 		if IsInPlaceUpdating(pod) {
-			klog.V(4).InfoS("Pod disruption tolerance",
-				"pod", pod.Name,
-				"running", singleGroupStats.running,
-				"configured", singleGroupStats.configured,
-				"tolerance", singleGroupStats.evictionTolerance,
-				"evicted", singleGroupStats.evicted,
-				"updating", singleGroupStats.inPlaceUpdating)
-
 			if singleGroupStats.running-(singleGroupStats.evicted+(singleGroupStats.inPlaceUpdating-1)) > shouldBeAlive {
-				klog.V(4).Infof("Would be able to evict, but already resizing %s", pod.Name)
+				klog.V(4).InfoS("Would be able to evict, but already resizing", "pod", klog.KObj(pod))
 
 				if pod.Status.Resize == apiv1.PodResizeStatusInfeasible || pod.Status.Resize == apiv1.PodResizeStatusDeferred {
-					klog.Warningf("Attempted in-place resize of %s impossible, should now evict", pod.Name)
+					klog.InfoS("Attempted in-place resize was impossible, should now evict", "pod", klog.KObj(pod), "resizePolicy", pod.Status.Resize)
 					return true
 				}
 			}
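
The log changes in this hunk follow the usual klog structured-logging migration: printf-style Infof/Warningf calls become InfoS calls with key/value pairs, and bare pod names become klog.KObj references so the output includes the namespace. A standalone illustration of the pattern (not code from this commit):

	import (
		apiv1 "k8s.io/api/core/v1"
		"k8s.io/klog/v2"
	)

	func logResizeExample(pod *apiv1.Pod) {
		// Old style: positional format string, pod name only.
		klog.V(4).Infof("Would be able to evict, but already resizing %s", pod.Name)
		// New style: structured key/value pairs; klog.KObj renders "<namespace>/<name>".
		klog.V(4).InfoS("Would be able to evict, but already resizing", "pod", klog.KObj(pod))
	}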
@@ -442,74 +441,67 @@ func setUpInformer(kubeClient kube_client.Interface, kind controllerKind) (cache
 
 // CanInPlaceUpdate performs the same checks
 func (e *podsEvictionRestrictionImpl) CanInPlaceUpdate(pod *apiv1.Pod) bool {
-
 	cr, present := e.podToReplicaCreatorMap[GetPodID(pod)]
 	if present {
-
 		// If our QoS class is guaranteed, we can't change the resources without a restart
-		// TODO(maxcao13): kubelet already prevents a resize of a guaranteed pod, so should we still check this early?
 		if pod.Status.QOSClass == apiv1.PodQOSGuaranteed {
-			klog.Warningf("Can't resize %s in-place, pod QoS is %s", pod.Name, pod.Status.QOSClass)
 			return false
 		}
 
-		// If we're already resizing this pod, don't do it again
 		if IsInPlaceUpdating(pod) {
-			klog.Warningf("Not resizing %s, already resizing it", pod.Name)
 			return false
 		}
 
 		noRestartPoliciesPopulated := true
+		isPodRestartPolicyNever := pod.Spec.RestartPolicy == apiv1.RestartPolicyNever
 
 		for _, container := range pod.Spec.Containers {
 			// If some of these are populated, we know it at least understands resizing
 			if len(container.ResizePolicy) > 0 {
				noRestartPoliciesPopulated = false
 			}
 
-			// TODO(maxcao13): Do we have to check the policy resource too? i.e. if only memory is getting scaled, then only check the memory resize policy?
 			for _, policy := range container.ResizePolicy {
 				if policy.RestartPolicy != apiv1.NotRequired {
-					klog.Warningf("in-place resize of %s will cause container disruption, container %s restart policy is %v", pod.Name, container.Name, policy.RestartPolicy)
+					klog.V(4).InfoS("in-place resize of pod will cause container disruption, because of container resize policy", "pod", klog.KObj(pod), "container", container.Name, "containerResizeRestartPolicy", policy.RestartPolicy)
 					// TODO(jkyros): is there something that prevents this from happening elsewhere in the API?
-					if pod.Spec.RestartPolicy == apiv1.RestartPolicyNever {
-						klog.Warningf("in-place resize of %s not possible, container %s resize policy is %v but pod restartPolicy is %v", pod.Name, container.Name, policy.RestartPolicy, pod.Spec.RestartPolicy)
+					if isPodRestartPolicyNever {
+						klog.InfoS("in-place resize of pod not possible, container resize policy and pod restartPolicy conflict", "pod", klog.KObj(pod), "container", container.Name, "containerResizeRestartPolicy", policy.RestartPolicy, "podRestartPolicy", pod.Spec.RestartPolicy)
 						return false
 					}
-
 				}
 			}
 		}
 
 		// If none of the policies are populated, our feature is probably not enabled, so we can't in-place regardless
 		if noRestartPoliciesPopulated {
-			klog.Warningf("impossible to resize %s in-place, container resize policies are not populated", pod.Name)
+			klog.InfoS("impossible to resize pod in-place, container resize policies are not populated", "pod", klog.KObj(pod))
 		}
 
 		//TODO(jkyros): Come back and handle sidecar containers at some point since they're weird?
 		singleGroupStats, present := e.creatorToSingleGroupStatsMap[cr]
 		// If we're pending, we can't in-place resize
 		// TODO(jkyros): are we sure we can't? Should I just set this to "if running"?
 		if pod.Status.Phase == apiv1.PodPending {
-			klog.V(4).Infof("Can't resize pending pod %s", pod.Name)
+			klog.V(4).InfoS("Can't resize pending pod", "pod", klog.KObj(pod))
 			return false
 		}
-		// This second "present" check is against the creator-to-group-stats map, not the pod-to-replica map
-		// TODO(maxcao13): Not sure, but do we need disruption tolerance for in-place updates? I guess we do since they are not guaranteed to be disruptionless...
-		// TODO(maxcao13): If this is okay, I may have to rename evictionTolerance to disruption tolerance
+		// TODO(maxcao13): May need to rename evictionTolerance to disruptionTolerance
 		if present {
+			// minimum number of pods that should be running to tolerate disruptions
+			shouldBeAlive := singleGroupStats.configured - singleGroupStats.evictionTolerance
+			// number of pods that are actually running
+			actuallyAlive := singleGroupStats.running - (singleGroupStats.evicted + singleGroupStats.inPlaceUpdating)
 			klog.V(4).InfoS("Checking pod disruption tolerance",
 				"podName", pod.Name,
 				"configuredPods", singleGroupStats.configured,
 				"runningPods", singleGroupStats.running,
 				"evictedPods", singleGroupStats.evicted,
 				"inPlaceUpdatingPods", singleGroupStats.inPlaceUpdating,
 				"evictionTolerance", singleGroupStats.evictionTolerance,
+				"shouldBeAlive", shouldBeAlive,
+				"actuallyAlive", actuallyAlive,
 			)
-			// minimum number of pods that should be running to tolerate disruptions
-			shouldBeAlive := singleGroupStats.configured - singleGroupStats.evictionTolerance
-			// number of pods that are actually running
-			actuallyAlive := singleGroupStats.running - (singleGroupStats.evicted + singleGroupStats.inPlaceUpdating)
 			if actuallyAlive > shouldBeAlive {
 				klog.V(4).InfoS("Pod can be resized in-place; more pods are running than required", "podName", pod.Name, "shouldBeAlive", shouldBeAlive, "actuallyAlive", actuallyAlive)
 				return true
533525
func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa *vpa_types.VerticalPodAutoscaler, eventRecorder record.EventRecorder) error {
534526
cr, present := e.podToReplicaCreatorMap[GetPodID(podToUpdate)]
535527
if !present {
536-
return fmt.Errorf("pod not suitable for eviction %v : not in replicated pods map", podToUpdate.Name)
528+
return fmt.Errorf("pod not suitable for eviction %v: not in replicated pods map", podToUpdate.Name)
537529
}
538530

539-
// TODO(maxcao13): Not sure if we need to check again here, but commenting it out for now in case we do
540-
// if !e.CanInPlaceUpdate(podToUpdate) {
541-
// return fmt.Errorf("cannot update pod %v in place : number of in-flight updates exceeded", podToUpdate.Name)
542-
// }
543-
544531
// TODO(maxcao13): There's maybe a more efficient way to do this, but this is what we have for now
545-
546532
// separate patches since we have to patch resize and spec separately
547533
resourcePatches := []resource_updates.PatchRecord{}
548534
annotationPatches := []resource_updates.PatchRecord{}
@@ -552,7 +538,7 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
552538
for i, calculator := range e.patchCalculators {
553539
p, err := calculator.CalculatePatches(podToUpdate, vpa)
554540
if err != nil {
555-
return fmt.Errorf("failed to calculate resource patch for pod %s/%s: %v", podToUpdate.Namespace, podToUpdate.Name, err)
541+
return err
556542
}
557543
klog.V(4).InfoS("Calculated patches for pod", "pod", klog.KObj(podToUpdate), "patches", p)
558544
// TODO(maxcao13): change how this works later, this is gross and depends on the resource calculator being first in the slice
@@ -566,35 +552,28 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
566552
if len(resourcePatches) > 0 {
567553
patch, err := json.Marshal(resourcePatches)
568554
if err != nil {
569-
klog.Errorf("Cannot marshal the patch %v: %v", resourcePatches, err)
570555
return err
571556
}
572557

573558
res, err := e.client.CoreV1().Pods(podToUpdate.Namespace).Patch(context.TODO(), podToUpdate.Name, k8stypes.JSONPatchType, patch, metav1.PatchOptions{}, "resize")
574559
if err != nil {
575-
klog.ErrorS(err, "Failed to patch pod", "pod", klog.KObj(podToUpdate))
576560
return err
577561
}
578562
klog.V(4).InfoS("In-place patched pod /resize subresource using patches ", "pod", klog.KObj(res), "patches", string(patch))
579563

580-
// TODO(maxcao13): whether or not we apply annotation patches should depend on resource patches?
581564
if len(annotationPatches) > 0 {
582565
patch, err := json.Marshal(annotationPatches)
583566
if err != nil {
584-
klog.Errorf("Cannot marshal the patch %v: %v", annotationPatches, err)
585567
return err
586568
}
587569
res, err = e.client.CoreV1().Pods(podToUpdate.Namespace).Patch(context.TODO(), podToUpdate.Name, k8stypes.JSONPatchType, patch, metav1.PatchOptions{})
588570
if err != nil {
589-
klog.ErrorS(err, "Failed to patch pod", "pod", klog.KObj(podToUpdate))
590571
return err
591572
}
592573
klog.V(4).InfoS("Patched pod annotations", "pod", klog.KObj(res), "patches", string(patch))
593574
}
594575
} else {
595-
err := fmt.Errorf("no patches to apply to %s", podToUpdate.Name)
596-
klog.ErrorS(err, "Failed to patch pod", "pod", klog.KObj(podToUpdate))
597-
return err
576+
return fmt.Errorf("no resource patches were calculated to apply")
598577
}
599578

600579
// TODO(maxcao13): If this keeps getting called on the same object with the same reason, it is considered a patch request.
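
For context, the resourcePatches marshalled above become JSON Patch operations applied to the pod's resize subresource, while annotation patches are sent against the pod object itself; that is why the two sets are marshalled and patched separately. A hypothetical example of a single resize patch record, assuming PatchRecord mirrors JSON Patch's op/path/value fields (values are illustrative, not from this commit):

	// Hypothetical illustration only: one record that, once marshalled, becomes
	// a JSON Patch op raising the first container's CPU request.
	bump := resource_updates.PatchRecord{
		Op:    "add",
		Path:  "/spec/containers/0/resources/requests/cpu",
		Value: "200m",
	}
	// json.Marshal([]resource_updates.PatchRecord{bump}) would then yield:
	// [{"op":"add","path":"/spec/containers/0/resources/requests/cpu","value":"200m"}]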
@@ -607,30 +586,19 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
 	if podToUpdate.Status.Phase == apiv1.PodRunning {
 		singleGroupStats, present := e.creatorToSingleGroupStatsMap[cr]
 		if !present {
-			klog.Errorf("Internal error - cannot find stats for replication group %v", cr)
+			klog.InfoS("Internal error - cannot find stats for replication group", "pod", klog.KObj(podToUpdate), "podReplicaCreator", cr)
 		} else {
 			singleGroupStats.inPlaceUpdating = singleGroupStats.inPlaceUpdating + 1
 			e.creatorToSingleGroupStatsMap[cr] = singleGroupStats
 		}
 	} else {
-		klog.Warningf("I updated, but my pod phase was %s", podToUpdate.Status.Phase)
+		klog.InfoS("Attempted to in-place update, but pod was not running", "pod", klog.KObj(podToUpdate), "phase", podToUpdate.Status.Phase)
 	}
 
 	return nil
 }
 
 // IsInPlaceUpdating checks whether or not the given pod is currently in the middle of an in-place update
 func IsInPlaceUpdating(podToCheck *apiv1.Pod) (isUpdating bool) {
-	// If the pod is currently updating we need to tally that
-	if podToCheck.Status.Resize != "" {
-		klog.V(4).InfoS("Pod is currently resizing", "pod", klog.KObj(podToCheck), "status", podToCheck.Status.Resize)
-		// Proposed -> Deferred -> InProgress, but what about Infeasible?
-		if podToCheck.Status.Resize == apiv1.PodResizeStatusInfeasible {
-			klog.V(4).InfoS("Resource proposal for pod is Infeasible, we're probably stuck like this until we evict", "pod", klog.KObj(podToCheck))
-		} else if podToCheck.Status.Resize == apiv1.PodResizeStatusDeferred {
-			klog.V(4).InfoS("Resource proposal for pod is Deferred, we're probably stuck like this until we evict", "pod", klog.KObj(podToCheck))
-		}
-		return true
-	}
-	return false
+	return podToCheck.Status.Resize != ""
 }
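
After this simplification, any non-empty Status.Resize (Proposed, InProgress, Deferred, or Infeasible) counts as an in-progress update, and the per-state logging is gone from the helper. A minimal table-driven test sketch for the new behavior (an assumed example, not part of this commit):

	func TestIsInPlaceUpdating(t *testing.T) {
		cases := []struct {
			resize apiv1.PodResizeStatus
			want   bool
		}{
			{"", false},
			{apiv1.PodResizeStatusProposed, true},
			{apiv1.PodResizeStatusInProgress, true},
			{apiv1.PodResizeStatusDeferred, true},
			{apiv1.PodResizeStatusInfeasible, true},
		}
		for _, tc := range cases {
			pod := &apiv1.Pod{Status: apiv1.PodStatus{Resize: tc.resize}}
			if got := IsInPlaceUpdating(pod); got != tc.want {
				t.Errorf("IsInPlaceUpdating(Resize=%q) = %v, want %v", tc.resize, got, tc.want)
			}
		}
	}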
