Skip to content

Commit e739fd9

Browse files
committed
VPA: fix logs and cleanup TODOs according to review
Signed-off-by: Max Cao <[email protected]>
1 parent 4211102 commit e739fd9

File tree

8 files changed

+276
-237
lines changed

8 files changed

+276
-237
lines changed

vertical-pod-autoscaler/e2e/v1/actuation.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -415,7 +415,7 @@ var _ = ActuationSuiteE2eDescribe("Actuation", func() {
415415

416416
InstallVPA(f, vpaCRD)
417417

418-
ginkgo.By(fmt.Sprintf("Waiting for in-place update, hoping it won't happen, sleep for %s", VpaEvictionTimeout.String()))
418+
ginkgo.By(fmt.Sprintf("Waiting for in-place update, hoping it won't happen, sleep for %s", VpaInPlaceTimeout.String()))
419419
CheckNoContainersRestarted(f)
420420

421421
ginkgo.By("Waiting for pods to be evicted")
@@ -457,7 +457,7 @@ var _ = ActuationSuiteE2eDescribe("Actuation", func() {
457457

458458
InstallVPA(f, vpaCRD)
459459

460-
ginkgo.By(fmt.Sprintf("Waiting for in-place update, hoping it won't happen, sleep for %s", VpaEvictionTimeout.String()))
460+
ginkgo.By(fmt.Sprintf("Waiting for in-place update, hoping it won't happen, sleep for %s", VpaInPlaceTimeout.String()))
461461
CheckNoContainersRestarted(f)
462462

463463
ginkgo.By("Waiting for pods to be evicted")
@@ -499,7 +499,7 @@ var _ = ActuationSuiteE2eDescribe("Actuation", func() {
499499

500500
InstallVPA(f, vpaCRD)
501501

502-
ginkgo.By(fmt.Sprintf("Waiting for in-place update, hoping it won't happen, sleep for %s", VpaEvictionTimeout.String()))
502+
ginkgo.By(fmt.Sprintf("Waiting for in-place update, hoping it won't happen, sleep for %s", VpaInPlaceTimeout.String()))
503503
CheckNoContainersRestarted(f)
504504

505505
ginkgo.By("Waiting for pods to be evicted")

vertical-pod-autoscaler/e2e/v1/common.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@ const (
5353
// VpaEvictionTimeout is a timeout for VPA to restart a pod if there are no
5454
// mechanisms blocking it (for example PDB).
5555
VpaEvictionTimeout = 3 * time.Minute
56+
// VpaInPlaceTimeout is a timeout for the VPA to finish in-place resizing a
57+
// pod (time for vpa to request inplace -> InProgress -> done)
58+
VpaInPlaceTimeout = 2 * time.Minute
5659

5760
defaultHamsterReplicas = int32(3)
5861
defaultHamsterBackoffLimit = int32(10)
@@ -459,7 +462,7 @@ func CheckNoPodsEvicted(f *framework.Framework, initialPodSet PodSet) {
459462
// updating containers in-place and checks that no containers were restarted.
460463
func CheckNoContainersRestarted(f *framework.Framework) {
461464
var foundContainerRestarts int32
462-
time.Sleep(VpaEvictionTimeout)
465+
time.Sleep(VpaInPlaceTimeout)
463466
podList, err := GetHamsterPods(f)
464467
for _, pod := range podList.Items {
465468
containerRestarts := getContainerRestarts(pod.Status)

vertical-pod-autoscaler/e2e/v1/updater.go

Lines changed: 55 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ var _ = UpdaterE2eDescribe("Updater", func() {
8181
ginkgo.By("Waiting for pods to be in-place updated")
8282

8383
//gomega.Expect(err).NotTo(gomega.HaveOccurred())
84+
// TODO(maxcao13): I don't think we need this much complexity for checking inplace, but I won't remove it for now
8485
err := WaitForPodsUpdatedWithoutEviction(f, initialPods, podList)
8586
gomega.Expect(err).NotTo(gomega.HaveOccurred())
8687
})
@@ -120,6 +121,45 @@ var _ = UpdaterE2eDescribe("Updater", func() {
120121
gomega.Expect(err).NotTo(gomega.HaveOccurred())
121122
})
122123

124+
ginkgo.It("does not in-place update pods when there is no recommendation", func() {
125+
const statusUpdateInterval = 10 * time.Second
126+
127+
ginkgo.By("Setting up the Admission Controller status")
128+
stopCh := make(chan struct{})
129+
statusUpdater := status.NewUpdater(
130+
f.ClientSet,
131+
status.AdmissionControllerStatusName,
132+
status.AdmissionControllerStatusNamespace,
133+
statusUpdateInterval,
134+
"e2e test",
135+
)
136+
defer func() {
137+
// Schedule a cleanup of the Admission Controller status.
138+
// Status is created outside the test namespace.
139+
ginkgo.By("Deleting the Admission Controller status")
140+
close(stopCh)
141+
err := f.ClientSet.CoordinationV1().Leases(status.AdmissionControllerStatusNamespace).
142+
Delete(context.TODO(), status.AdmissionControllerStatusName, metav1.DeleteOptions{})
143+
gomega.Expect(err).NotTo(gomega.HaveOccurred())
144+
}()
145+
statusUpdater.Run(stopCh)
146+
147+
podList := setupPodsForUpscalingWithoutRecommendation(f)
148+
if len(podList.Items[0].Spec.Containers[0].ResizePolicy) <= 0 {
149+
// Feature is probably not working here
150+
ginkgo.Skip("Skipping test, InPlacePodVerticalScaling not available")
151+
}
152+
153+
ginkgo.By(fmt.Sprintf("Waiting for pods to be in-place updated, hoping it won't happen, sleep for %s", VpaInPlaceTimeout.String()))
154+
CheckNoContainersRestarted(f)
155+
156+
updatedPodList, err := GetHamsterPods(f)
157+
gomega.Expect(err).NotTo(gomega.HaveOccurred())
158+
for _, pod := range updatedPodList.Items {
159+
gomega.Expect(getCPURequest(pod.Spec)).To(gomega.Equal(ParseQuantityOrDie("100m")))
160+
}
161+
})
162+
123163
ginkgo.It("evicts pods when Admission Controller status available", func() {
124164
const statusUpdateInterval = 10 * time.Second
125165

@@ -265,14 +305,18 @@ func setupPodsForEviction(f *framework.Framework, hamsterCPU, hamsterMemory stri
265305
}
266306

267307
func setupPodsForUpscalingInPlace(f *framework.Framework) *apiv1.PodList {
268-
return setupPodsForInPlace(f, "100m", "100Mi", nil)
308+
return setupPodsForInPlace(f, "100m", "100Mi", nil, true)
269309
}
270310

271311
func setupPodsForDownscalingInPlace(f *framework.Framework, er []*vpa_types.EvictionRequirement) *apiv1.PodList {
272-
return setupPodsForInPlace(f, "500m", "500Mi", er)
312+
return setupPodsForInPlace(f, "500m", "500Mi", er, true)
313+
}
314+
315+
func setupPodsForUpscalingWithoutRecommendation(f *framework.Framework) *apiv1.PodList {
316+
return setupPodsForInPlace(f, "100m", "100Mi", nil, false)
273317
}
274318

275-
func setupPodsForInPlace(f *framework.Framework, hamsterCPU, hamsterMemory string, er []*vpa_types.EvictionRequirement) *apiv1.PodList {
319+
func setupPodsForInPlace(f *framework.Framework, hamsterCPU, hamsterMemory string, er []*vpa_types.EvictionRequirement, withRecommendation bool) *apiv1.PodList {
276320
controller := &autoscaling.CrossVersionObjectReference{
277321
APIVersion: "apps/v1",
278322
Kind: "Deployment",
@@ -285,22 +329,25 @@ func setupPodsForInPlace(f *framework.Framework, hamsterCPU, hamsterMemory strin
285329

286330
ginkgo.By("Setting up a VPA CRD")
287331
containerName := GetHamsterContainerNameByIndex(0)
288-
vpaCRD := test.VerticalPodAutoscaler().
332+
vpaBuilder := test.VerticalPodAutoscaler().
289333
WithName("hamster-vpa").
290334
WithNamespace(f.Namespace.Name).
291335
WithTargetRef(controller).
292336
WithUpdateMode(vpa_types.UpdateModeInPlaceOrRecreate).
293337
WithEvictionRequirements(er).
294-
WithContainer(containerName).
295-
AppendRecommendation(
338+
WithContainer(containerName)
339+
340+
if withRecommendation {
341+
vpaBuilder = vpaBuilder.AppendRecommendation(
296342
test.Recommendation().
297343
WithContainer(containerName).
298344
WithTarget(containerName, "200m").
299345
WithLowerBound(containerName, "200m").
300346
WithUpperBound(containerName, "200m").
301-
GetContainerResources()).
302-
Get()
347+
GetContainerResources())
348+
}
303349

350+
vpaCRD := vpaBuilder.Get()
304351
InstallVPA(f, vpaCRD)
305352

306353
return podList

vertical-pod-autoscaler/pkg/updater/eviction/pods_eviction_restriction.go

Lines changed: 55 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -123,20 +123,19 @@ func (e *podsEvictionRestrictionImpl) CanEvict(pod *apiv1.Pod) bool {
123123
// they might cause disruption. We assume pods will not be both in-place updated and evicted in the same pass, but
124124
// we need eviction to take the numbers into account so we don't violate our disruption tolerances.
125125
// If we're already resizing this pod, don't do anything to it, unless we failed to resize it, then we want to evict it.
126+
klog.V(4).InfoS("Pod disruption tolerance",
127+
"pod", klog.KObj(pod),
128+
"running", singleGroupStats.running,
129+
"configured", singleGroupStats.configured,
130+
"tolerance", singleGroupStats.evictionTolerance,
131+
"evicted", singleGroupStats.evicted,
132+
"updating", singleGroupStats.inPlaceUpdating)
126133
if IsInPlaceUpdating(pod) {
127-
klog.V(4).InfoS("Pod disruption tolerance",
128-
"pod", pod.Name,
129-
"running", singleGroupStats.running,
130-
"configured", singleGroupStats.configured,
131-
"tolerance", singleGroupStats.evictionTolerance,
132-
"evicted", singleGroupStats.evicted,
133-
"updating", singleGroupStats.inPlaceUpdating)
134-
135134
if singleGroupStats.running-(singleGroupStats.evicted+(singleGroupStats.inPlaceUpdating-1)) > shouldBeAlive {
136-
klog.V(4).Infof("Would be able to evict, but already resizing %s", pod.Name)
135+
klog.V(4).InfoS("Would be able to evict, but already resizing", "pod", klog.KObj(pod))
137136

138137
if pod.Status.Resize == apiv1.PodResizeStatusInfeasible || pod.Status.Resize == apiv1.PodResizeStatusDeferred {
139-
klog.Warningf("Attempted in-place resize of %s impossible, should now evict", pod.Name)
138+
klog.InfoS("Attempted in-place resize was impossible, should now evict", "pod", klog.KObj(pod), "resizePolicy", pod.Status.Resize)
140139
return true
141140
}
142141
}
@@ -441,74 +440,67 @@ func setUpInformer(kubeClient kube_client.Interface, kind controllerKind) (cache
441440

442441
// CanInPlaceUpdate performs the same checks
443442
func (e *podsEvictionRestrictionImpl) CanInPlaceUpdate(pod *apiv1.Pod) bool {
444-
445443
cr, present := e.podToReplicaCreatorMap[GetPodID(pod)]
446444
if present {
447-
448445
// If our QoS class is guaranteed, we can't change the resources without a restart
449-
// TODO(maxcao13): kubelet already prevents a resize of a guaranteed pod, so should we still check this early?
450446
if pod.Status.QOSClass == apiv1.PodQOSGuaranteed {
451-
klog.Warningf("Can't resize %s in-place, pod QoS is %s", pod.Name, pod.Status.QOSClass)
452447
return false
453448
}
454449

455-
// If we're already resizing this pod, don't do it again
456450
if IsInPlaceUpdating(pod) {
457-
klog.Warningf("Not resizing %s, already resizing it", pod.Name)
458451
return false
459452
}
460453

461454
noRestartPoliciesPopulated := true
455+
isPodRestartPolicyNever := pod.Spec.RestartPolicy == apiv1.RestartPolicyNever
462456

463457
for _, container := range pod.Spec.Containers {
464458
// If some of these are populated, we know it at least understands resizing
465459
if len(container.ResizePolicy) > 0 {
466460
noRestartPoliciesPopulated = false
467461
}
468462

469-
// TODO(maxcao13): Do we have to check the policy resource too? i.e. if only memory is getting scaled, then only check the memory resize policy?
470463
for _, policy := range container.ResizePolicy {
471464
if policy.RestartPolicy != apiv1.NotRequired {
472-
klog.Warningf("in-place resize of %s will cause container disruption, container %s restart policy is %v", pod.Name, container.Name, policy.RestartPolicy)
465+
klog.V(4).InfoS("in-place resize of pod will cause container disruption, because of container resize policy", "pod", klog.KObj(pod), "container", container.Name, "containerResizeRestartPolicy", policy.RestartPolicy)
473466
// TODO(jkyros): is there something that prevents this from happening elsewhere in the API?
474-
if pod.Spec.RestartPolicy == apiv1.RestartPolicyNever {
475-
klog.Warningf("in-place resize of %s not possible, container %s resize policy is %v but pod restartPolicy is %v", pod.Name, container.Name, policy.RestartPolicy, pod.Spec.RestartPolicy)
467+
if isPodRestartPolicyNever {
468+
klog.InfoS("in-place resize of pod not possible, container resize policy and pod restartPolicy conflict", "pod", klog.KObj(pod), "container", container.Name, "containerResizeRestartPolicy", policy.RestartPolicy, "podRestartPolicy", pod.Spec.RestartPolicy)
476469
return false
477470
}
478-
479471
}
480472
}
481473
}
482474

483475
// If none of the policies are populated, our feature is probably not enabled, so we can't in-place regardless
484476
if noRestartPoliciesPopulated {
485-
klog.Warningf("impossible to resize %s in-place, container resize policies are not populated", pod.Name)
477+
klog.InfoS("impossible to resize pod in-place, container resize policies are not populated", "pod", klog.KObj(pod))
486478
}
487479

488480
//TODO(jkyros): Come back and handle sidecar containers at some point since they're weird?
489481
singleGroupStats, present := e.creatorToSingleGroupStatsMap[cr]
490482
// If we're pending, we can't in-place resize
491483
// TODO(jkyros): are we sure we can't? Should I just set this to "if running"?
492484
if pod.Status.Phase == apiv1.PodPending {
493-
klog.V(4).Infof("Can't resize pending pod %s", pod.Name)
485+
klog.V(4).InfoS("Can't resize pending pod", "pod", klog.KObj(pod))
494486
return false
495487
}
496-
// This second "present" check is against the creator-to-group-stats map, not the pod-to-replica map
497-
// TODO(maxcao13): Not sure, but do we need disruption tolerance for in-place updates? I guess we do since they are not guaranteed to be disruptionless...
498-
// TODO(maxcao13): If this is okay, I may have to rename evictionTolerance to disruption tolerance
488+
// TODO(maxcao13): May need to rename evictionTolerance to disruptionTolerance
499489
if present {
490+
// minimum number of pods that should be running to tolerate disruptions
491+
shouldBeAlive := singleGroupStats.configured - singleGroupStats.evictionTolerance
492+
// number of pods that are actually running
493+
actuallyAlive := singleGroupStats.running - (singleGroupStats.evicted + singleGroupStats.inPlaceUpdating)
500494
klog.V(4).InfoS("Checking pod disruption tolerance",
501495
"podName", pod.Name,
502496
"configuredPods", singleGroupStats.configured,
503497
"runningPods", singleGroupStats.running,
504498
"evictedPods", singleGroupStats.evicted,
505499
"inPlaceUpdatingPods", singleGroupStats.inPlaceUpdating,
506500
"evictionTolerance", singleGroupStats.evictionTolerance,
501+
"shouldBeAlive", shouldBeAlive,
502+
"actuallyAlive", actuallyAlive,
507503
)
508-
// minimum number of pods that should be running to tolerate disruptions
509-
shouldBeAlive := singleGroupStats.configured - singleGroupStats.evictionTolerance
510-
// number of pods that are actually running
511-
actuallyAlive := singleGroupStats.running - (singleGroupStats.evicted + singleGroupStats.inPlaceUpdating)
512504
if actuallyAlive > shouldBeAlive {
513505
klog.V(4).InfoS("Pod can be resized in-place; more pods are running than required", "podName", pod.Name, "shouldBeAlive", shouldBeAlive, "actuallyAlive", actuallyAlive)
514506
return true
@@ -532,16 +524,10 @@ func (e *podsEvictionRestrictionImpl) CanInPlaceUpdate(pod *apiv1.Pod) bool {
532524
func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa *vpa_types.VerticalPodAutoscaler, eventRecorder record.EventRecorder) error {
533525
cr, present := e.podToReplicaCreatorMap[GetPodID(podToUpdate)]
534526
if !present {
535-
return fmt.Errorf("pod not suitable for eviction %v : not in replicated pods map", podToUpdate.Name)
527+
return fmt.Errorf("pod not suitable for eviction %v: not in replicated pods map", podToUpdate.Name)
536528
}
537529

538-
// TODO(maxcao13): Not sure if we need to check again here, but commenting it out for now in case we do
539-
// if !e.CanInPlaceUpdate(podToUpdate) {
540-
// return fmt.Errorf("cannot update pod %v in place : number of in-flight updates exceeded", podToUpdate.Name)
541-
// }
542-
543530
// TODO(maxcao13): There's maybe a more efficient way to do this, but this is what we have for now
544-
545531
// separate patches since we have to patch resize and spec separately
546532
resourcePatches := []resource_updates.PatchRecord{}
547533
annotationPatches := []resource_updates.PatchRecord{}
@@ -551,7 +537,7 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
551537
for i, calculator := range e.patchCalculators {
552538
p, err := calculator.CalculatePatches(podToUpdate, vpa)
553539
if err != nil {
554-
return fmt.Errorf("failed to calculate resource patch for pod %s/%s: %v", podToUpdate.Namespace, podToUpdate.Name, err)
540+
return err
555541
}
556542
klog.V(4).InfoS("Calculated patches for pod", "pod", klog.KObj(podToUpdate), "patches", p)
557543
// TODO(maxcao13): change how this works later, this is gross and depends on the resource calculator being first in the slice
@@ -565,35 +551,28 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
565551
if len(resourcePatches) > 0 {
566552
patch, err := json.Marshal(resourcePatches)
567553
if err != nil {
568-
klog.Errorf("Cannot marshal the patch %v: %v", resourcePatches, err)
569554
return err
570555
}
571556

572557
res, err := e.client.CoreV1().Pods(podToUpdate.Namespace).Patch(context.TODO(), podToUpdate.Name, k8stypes.JSONPatchType, patch, metav1.PatchOptions{}, "resize")
573558
if err != nil {
574-
klog.ErrorS(err, "Failed to patch pod", "pod", klog.KObj(podToUpdate))
575559
return err
576560
}
577561
klog.V(4).InfoS("In-place patched pod /resize subresource using patches ", "pod", klog.KObj(res), "patches", string(patch))
578562

579-
// TODO(maxcao13): whether or not we apply annotation patches should depend on resource patches?
580563
if len(annotationPatches) > 0 {
581564
patch, err := json.Marshal(annotationPatches)
582565
if err != nil {
583-
klog.Errorf("Cannot marshal the patch %v: %v", annotationPatches, err)
584566
return err
585567
}
586568
res, err = e.client.CoreV1().Pods(podToUpdate.Namespace).Patch(context.TODO(), podToUpdate.Name, k8stypes.JSONPatchType, patch, metav1.PatchOptions{})
587569
if err != nil {
588-
klog.ErrorS(err, "Failed to patch pod", "pod", klog.KObj(podToUpdate))
589570
return err
590571
}
591572
klog.V(4).InfoS("Patched pod annotations", "pod", klog.KObj(res), "patches", string(patch))
592573
}
593574
} else {
594-
err := fmt.Errorf("no patches to apply to %s", podToUpdate.Name)
595-
klog.ErrorS(err, "Failed to patch pod", "pod", klog.KObj(podToUpdate))
596-
return err
575+
return fmt.Errorf("no resource patches were calculated to apply")
597576
}
598577

599578
// TODO(maxcao13): If this keeps getting called on the same object with the same reason, it is considered a patch request.
@@ -606,30 +585,47 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
606585
if podToUpdate.Status.Phase == apiv1.PodRunning {
607586
singleGroupStats, present := e.creatorToSingleGroupStatsMap[cr]
608587
if !present {
609-
klog.Errorf("Internal error - cannot find stats for replication group %v", cr)
588+
klog.InfoS("Internal error - cannot find stats for replication group", "pod", klog.KObj(podToUpdate), "podReplicaCreator", cr)
610589
} else {
611590
singleGroupStats.inPlaceUpdating = singleGroupStats.inPlaceUpdating + 1
612591
e.creatorToSingleGroupStatsMap[cr] = singleGroupStats
613592
}
614593
} else {
615-
klog.Warningf("I updated, but my pod phase was %s", podToUpdate.Status.Phase)
594+
klog.InfoS("Attempted to in-place update, but pod was not running", "pod", klog.KObj(podToUpdate), "phase", podToUpdate.Status.Phase)
616595
}
617596

618597
return nil
619598
}
620599

621600
// IsInPlaceUpdating checks whether or not the given pod is currently in the middle of an in-place update
622601
func IsInPlaceUpdating(podToCheck *apiv1.Pod) (isUpdating bool) {
623-
// If the pod is currently updating we need to tally that
624-
if podToCheck.Status.Resize != "" {
625-
klog.V(4).InfoS("Pod is currently resizing", "pod", klog.KObj(podToCheck), "status", podToCheck.Status.Resize)
626-
// Proposed -> Deferred -> InProgress, but what about Infeasible?
627-
if podToCheck.Status.Resize == apiv1.PodResizeStatusInfeasible {
628-
klog.V(4).InfoS("Resource proposal for pod is Infeasible, we're probably stuck like this until we evict", "pod", klog.KObj(podToCheck))
629-
} else if podToCheck.Status.Resize == apiv1.PodResizeStatusDeferred {
630-
klog.V(4).InfoS("Resource proposal for pod is Deferred, we're probably stuck like this until we evict", "pod", klog.KObj(podToCheck))
631-
}
632-
return true
602+
return podToCheck.Status.Resize != ""
603+
}
604+
605+
// UpdatablePod is a Pod wrapper that record helpful attributes of a Pod
606+
// to be used when attempting to evict or in-place update.
607+
type UpdatablePod struct {
608+
pod *apiv1.Pod
609+
disruptionless bool
610+
}
611+
612+
// Pod returns the underlying pod
613+
func (p *UpdatablePod) Pod() *apiv1.Pod {
614+
return p.pod
615+
}
616+
617+
// IsDisruptionless returns the disruptionless status of the updatablePod
618+
func (p *UpdatablePod) IsDisruptionless() bool {
619+
return p.disruptionless
620+
}
621+
622+
// SetDisruptionless sets the disruptionless status of the updatablePod to true
623+
func (p *UpdatablePod) SetDisruptionless() {
624+
p.disruptionless = true
625+
}
626+
627+
func NewUpdatablePod(pod *apiv1.Pod) *UpdatablePod {
628+
return &UpdatablePod{
629+
pod: pod,
633630
}
634-
return false
635631
}

0 commit comments

Comments
 (0)