Commit 53d4006

VPA: fix logs and cleanup TODOs according to review
Signed-off-by: Max Cao <[email protected]>
1 parent: e406979

File tree

4 files changed, +41 -35 lines

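The diffs below follow one pattern: the updater's printf-style klog calls (Infof, Warningf, Errorf) are replaced with structured calls (InfoS, ErrorS), and pods are identified with klog.KObj so the namespace/name pair is emitted as a structured key instead of being interpolated into the message. A minimal sketch of that convention, assuming the usual klog/v2 and core/v1 imports; the pod value and the log message are made up for illustration, they are not taken from this commit:

package main

import (
    apiv1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/klog/v2"
)

func logResizeDecision(pod *apiv1.Pod) {
    // Printf-style (the form being removed): the pod name is baked into the
    // message string, so log processors cannot key on it.
    klog.V(4).Infof("Can't resize pending pod %s", pod.Name)

    // Structured (the form being introduced): the message stays constant and the
    // pod is attached as a key/value pair; klog.KObj renders "namespace/name".
    klog.V(4).InfoS("Can't resize pending pod", "pod", klog.KObj(pod))
}

func main() {
    pod := &apiv1.Pod{ObjectMeta: metav1.ObjectMeta{Namespace: "default", Name: "example"}}
    logResizeDecision(pod)
}

Both lines are at verbosity 4, so they only show up when the component runs with -v=4 or higher.
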
vertical-pod-autoscaler/pkg/updater/eviction/pods_eviction_restriction.go

Lines changed: 13 additions & 12 deletions
@@ -124,18 +124,18 @@ func (e *podsEvictionRestrictionImpl) CanEvict(pod *apiv1.Pod) bool {
     // If we're already resizing this pod, don't do anything to it, unless we failed to resize it, then we want to evict it.
     if IsInPlaceUpdating(pod) {
         klog.V(4).InfoS("Pod disruption tolerance",
-            "pod", pod.Name,
+            "pod", klog.KObj(pod),
             "running", singleGroupStats.running,
             "configured", singleGroupStats.configured,
             "tolerance", singleGroupStats.evictionTolerance,
             "evicted", singleGroupStats.evicted,
             "updating", singleGroupStats.inPlaceUpdating)

         if singleGroupStats.running-(singleGroupStats.evicted+(singleGroupStats.inPlaceUpdating-1)) > shouldBeAlive {
-            klog.V(4).Infof("Would be able to evict, but already resizing %s", pod.Name)
+            klog.V(4).InfoS("Would be able to evict, but already resizing", "pod", klog.KObj(pod))

             if pod.Status.Resize == apiv1.PodResizeStatusInfeasible || pod.Status.Resize == apiv1.PodResizeStatusDeferred {
-                klog.Warningf("Attempted in-place resize of %s impossible, should now evict", pod.Name)
+                klog.InfoS("Attempted in-place resize was impossible, should now evict", "pod", klog.KObj(pod))
                 return true
             }
         }
@@ -447,13 +447,13 @@ func (e *podsEvictionRestrictionImpl) CanInPlaceUpdate(pod *apiv1.Pod) bool {
     // If our QoS class is guaranteed, we can't change the resources without a restart
     // TODO(maxcao13): kubelet already prevents a resize of a guaranteed pod, so should we still check this early?
     if pod.Status.QOSClass == apiv1.PodQOSGuaranteed {
-        klog.Warningf("Can't resize %s in-place, pod QoS is %s", pod.Name, pod.Status.QOSClass)
+        klog.V(4).InfoS("Can't resize pod in-place", "pod", klog.KObj(pod), "qosClass", pod.Status.QOSClass)
         return false
     }

     // If we're already resizing this pod, don't do it again
     if IsInPlaceUpdating(pod) {
-        klog.Warningf("Not resizing %s, already resizing it", pod.Name)
+        klog.V(4).InfoS("Not resizing pod again, because we are already resizing it", "pod", klog.KObj(pod))
         return false
     }

@@ -468,7 +468,7 @@ func (e *podsEvictionRestrictionImpl) CanInPlaceUpdate(pod *apiv1.Pod) bool {
     // TODO(maxcao13): Do we have to check the policy resource too? i.e. if only memory is getting scaled, then only check the memory resize policy?
     for _, policy := range container.ResizePolicy {
         if policy.RestartPolicy != apiv1.NotRequired {
-            klog.Warningf("in-place resize of %s will cause container disruption, container %s restart policy is %v", pod.Name, container.Name, policy.RestartPolicy)
+            klog.InfoS("in-place resize of pod will cause container disruption, because of container resize policy", "pod", klog.KObj(pod), "container", container.Name, "restartPolicy", policy.RestartPolicy)
             // TODO(jkyros): is there something that prevents this from happening elsewhere in the API?
             if pod.Spec.RestartPolicy == apiv1.RestartPolicyNever {
                 klog.Warningf("in-place resize of %s not possible, container %s resize policy is %v but pod restartPolicy is %v", pod.Name, container.Name, policy.RestartPolicy, pod.Spec.RestartPolicy)
@@ -481,15 +481,15 @@ func (e *podsEvictionRestrictionImpl) CanInPlaceUpdate(pod *apiv1.Pod) bool {

     // If none of the policies are populated, our feature is probably not enabled, so we can't in-place regardless
     if noRestartPoliciesPopulated {
-        klog.Warningf("impossible to resize %s in-place, container resize policies are not populated", pod.Name)
+        klog.InfoS("impossible to resize pod in-place, container resize policies are not populated", "pod", klog.KObj(pod))
     }

     //TODO(jkyros): Come back and handle sidecar containers at some point since they're weird?
     singleGroupStats, present := e.creatorToSingleGroupStatsMap[cr]
     // If we're pending, we can't in-place resize
     // TODO(jkyros): are we sure we can't? Should I just set this to "if running"?
     if pod.Status.Phase == apiv1.PodPending {
-        klog.V(4).Infof("Can't resize pending pod %s", pod.Name)
+        klog.V(4).InfoS("Can't resize pending pod", "pod", klog.KObj(pod))
         return false
     }
     // This second "present" check is against the creator-to-group-stats map, not the pod-to-replica map
@@ -550,7 +550,8 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
     for i, calculator := range e.patchCalculators {
         p, err := calculator.CalculatePatches(podToUpdate, vpa)
         if err != nil {
-            return fmt.Errorf("failed to calculate resource patch for pod %s/%s: %v", podToUpdate.Namespace, podToUpdate.Name, err)
+            klog.ErrorS(err, "failed to calculate resource patch for pod", "pod", klog.KObj(podToUpdate))
+            return err
         }
         klog.V(4).InfoS("Calculated patches for pod", "pod", klog.KObj(podToUpdate), "patches", p)
         // TODO(maxcao13): change how this works later, this is gross and depends on the resource calculator being first in the slice
@@ -564,7 +565,7 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
     if len(resourcePatches) > 0 {
         patch, err := json.Marshal(resourcePatches)
         if err != nil {
-            klog.Errorf("Cannot marshal the patch %v: %v", resourcePatches, err)
+            klog.ErrorS(err, "Cannot marshal the patch %v", resourcePatches)
             return err
         }

@@ -579,7 +580,7 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
     if len(annotationPatches) > 0 {
         patch, err := json.Marshal(annotationPatches)
         if err != nil {
-            klog.Errorf("Cannot marshal the patch %v: %v", annotationPatches, err)
+            klog.ErrorS(err, "Cannot marshal the patch %v", annotationPatches)
             return err
         }
         res, err = e.client.CoreV1().Pods(podToUpdate.Namespace).Patch(context.TODO(), podToUpdate.Name, k8stypes.JSONPatchType, patch, metav1.PatchOptions{})
@@ -611,7 +612,7 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
             e.creatorToSingleGroupStatsMap[cr] = singleGroupStats
         }
     } else {
-        klog.Warningf("I updated, but my pod phase was %s", podToUpdate.Status.Phase)
+        klog.InfoS("Attempted to update", "pod", klog.KObj(podToUpdate), "phase", podToUpdate.Status.Phase)
     }

     return nil
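
One hunk in InPlaceUpdate also changes how a patch-calculation failure is reported: the old line wrapped the error with fmt.Errorf and returned the wrapped value, the new lines log it with klog.ErrorS and return the original error unchanged. A small sketch contrasting the two styles; calculatePatches and the package name are stand-ins invented for the example, not the VPA's real calculator:

package example

import (
    "errors"
    "fmt"

    apiv1 "k8s.io/api/core/v1"
    "k8s.io/klog/v2"
)

// calculatePatches stands in for the real patch calculator so the two
// error-handling styles below compile; it always fails.
func calculatePatches(pod *apiv1.Pod) ([]byte, error) {
    return nil, errors.New("recommendation missing")
}

// wrapAndReturn mirrors the removed line: pod context is folded into the error text.
func wrapAndReturn(pod *apiv1.Pod) error {
    if _, err := calculatePatches(pod); err != nil {
        return fmt.Errorf("failed to calculate resource patch for pod %s/%s: %v", pod.Namespace, pod.Name, err)
    }
    return nil
}

// logAndReturn mirrors the added lines: pod context goes to the structured log,
// and the caller receives the untouched error.
func logAndReturn(pod *apiv1.Pod) error {
    if _, err := calculatePatches(pod); err != nil {
        klog.ErrorS(err, "failed to calculate resource patch for pod", "pod", klog.KObj(pod))
        return err
    }
    return nil
}

The trade-off: wrapping keeps the context attached to the error value a caller sees, while log-and-return keeps the error clean and relies on the structured log entry for the pod context.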

vertical-pod-autoscaler/pkg/updater/logic/updater.go

Lines changed: 17 additions & 17 deletions
@@ -247,20 +247,20 @@ func (u *updater) RunOnce(ctx context.Context) {

     fallBackToEviction, err := u.AttemptInPlaceScalingIfPossible(ctx, vpaSize, vpa, pod, evictionLimiter, vpasWithInPlaceUpdatablePodsCounter, vpasWithInPlaceUpdatedPodsCounter)
     if err != nil {
-        klog.Warningf("error attemptng to scale pod %v in-place: %v", pod.Name, err)
+        klog.ErrorS(err, "error attemptng to scale pod in-place", "pod", klog.KObj(pod))
         return
     }
     // If in-place scaling was possible, and it isn't stuck, then skip eviction
     if fallBackToEviction {
         // TODO(jkyros): this needs to be cleaner, but we absolutely need to make sure a disruptionless update doesn't "sneak through"
         if prioritizedPod.IsDisruptionless() {
-            klog.Infof("Not falling back to eviction, %v was supposed to be disruptionless", pod.Name)
+            klog.InfoS("Not falling back to eviction, pod was supposed to be disruptionless", "pod", klog.KObj(pod))
             continue
         } else {
-            klog.V(4).Infof("Falling back to eviction for %s", pod.Name)
+            klog.V(4).InfoS("Falling back to eviction for pod", "pod", klog.KObj(pod))
         }
     } else {
-        klog.Infof("Not falling back to eviction, because we don't have a recommendation yet? %v", pod.Name)
+        klog.InfoS("Not falling back to eviction, because we don't have a recommendation yet?", "pod", klog.KObj(pod))
         continue
     }

@@ -392,7 +392,7 @@ func (u *updater) AttemptInPlaceScalingIfPossible(ctx context.Context, vpaSize i
     // queue this for in-place before the VPA has made a recommendation.

     if !VpaRecommendationProvided(vpa) {
-        klog.V(4).Infof("VPA hasn't made a recommendation yet, we're early on %s for some reason", pod.Name)
+        klog.V(4).InfoS("VPA hasn't made a recommendation for the pod yet", "pod", klog.KObj(pod))
         // TODO(jkyros): so we must have had some erroneous evictions before, but we were passing the test suite? But for now if I want to test
         // in-place I need it to not evict immediately if I can't in-place (because then it will never in-place)
         fallBackToEviction = false
@@ -405,22 +405,22 @@ func (u *updater) AttemptInPlaceScalingIfPossible(ctx context.Context, vpaSize i
     withInPlaceUpdatable := false
     withInPlaceUpdated := false

-    klog.V(4).Infof("Looks like we might be able to in-place update %s..", pod.Name)
+    klog.V(4).InfoS("Looks like we might be able to in-place update pod...", "pod", klog.KObj(pod))
     withInPlaceUpdatable = true
     // If I can't update
     if !evictionLimiter.CanInPlaceUpdate(pod) {
         // If we are already updating, wait for the next loop to progress
         if !eviction.IsInPlaceUpdating(pod) {
             // But it's not in-place updating, something went wrong (e.g. the operation would change pods QoS)
             // fall back to eviction
-            klog.V(4).Infof("Can't in-place update pod %s, falling back to eviction, it might say no", pod.Name)
+            klog.V(4).InfoS("Can't in-place update pod, falling back to eviction, it might say no", "pod", klog.KObj(pod))
             fallBackToEviction = true
             return
         }

         lastUpdateTime, exists := u.lastInPlaceUpdateAttemptTimeMap[eviction.GetPodID(pod)]
         if !exists {
-            klog.V(4).Infof("In-place update in progress for %s/%s, but no lastInPlaceUpdateTime found, setting it to now", pod.Namespace, pod.Name)
+            klog.V(4).InfoS("In-place update in progress for pod but no lastInPlaceUpdateTime found, setting it to now", "pod", klog.KObj(pod))
             lastUpdateTime = time.Now()
             u.lastInPlaceUpdateAttemptTimeMap[eviction.GetPodID(pod)] = lastUpdateTime
         }
@@ -432,23 +432,23 @@ func (u *updater) AttemptInPlaceScalingIfPossible(ctx context.Context, vpaSize i
         switch pod.Status.Resize {
         case apiv1.PodResizeStatusDeferred:
             if time.Since(lastUpdateTime) > DeferredResizeUpdateTimeout {
-                klog.V(4).InfoS(fmt.Sprintf("In-place update deferred for more than %v, falling back to eviction", DeferredResizeUpdateTimeout), "pod", pod.Name)
+                klog.V(4).InfoS(fmt.Sprintf("In-place update deferred for more than %v, falling back to eviction", DeferredResizeUpdateTimeout), "pod", klog.KObj(pod))
                 fallBackToEviction = true
             } else {
-                klog.V(4).Infof("In-place update deferred for %s, NOT falling back to eviction yet", pod.Name)
+                klog.V(4).InfoS("In-place update deferred, NOT falling back to eviction yet", "pod", klog.KObj(pod))
             }
         case apiv1.PodResizeStatusInProgress:
             if time.Since(lastUpdateTime) > InProgressResizeUpdateTimeout {
-                klog.V(4).InfoS(fmt.Sprintf("In-place update in progress for more than %v, falling back to eviction", InProgressResizeUpdateTimeout), "pod", pod.Name)
+                klog.V(4).InfoS(fmt.Sprintf("In-place update in progress for more than %v, falling back to eviction", InProgressResizeUpdateTimeout), "pod", klog.KObj(pod))
                 fallBackToEviction = true
             } else {
-                klog.V(4).InfoS("In-place update in progress, NOT falling back to eviction yet", "pod", pod.Name)
+                klog.V(4).InfoS("In-place update in progress, NOT falling back to eviction yet", "pod", klog.KObj(pod))
             }
         case apiv1.PodResizeStatusInfeasible:
-            klog.V(4).InfoS("In-place update infeasible, falling back to eviction", "pod", pod.Name)
+            klog.V(4).InfoS("In-place update infeasible, falling back to eviction", "pod", klog.KObj(pod))
             fallBackToEviction = true
         default:
-            klog.V(4).InfoS("In-place update status unknown, falling back to eviction", "pod", pod.Name)
+            klog.V(4).InfoS("In-place update status unknown, falling back to eviction", "pod", klog.KObj(pod))
             fallBackToEviction = true
         }
         return
@@ -458,15 +458,15 @@ func (u *updater) AttemptInPlaceScalingIfPossible(ctx context.Context, vpaSize i
     err = u.evictionRateLimiter.Wait(ctx)
     if err != nil {
         // TODO(jkyros): whether or not we fall back to eviction here probably depends on *why* we failed
-        klog.Warningf("updating pod %v failed: %v", pod.Name, err)
+        klog.ErrorS(err, "updating pod failed", "pod", klog.KObj(pod))
         return
     }

-    klog.V(2).Infof("attempting to in-place update pod %v", pod.Name)
+    klog.V(2).InfoS("attempting to in-place update pod", "pod", klog.KObj(pod))
     u.lastInPlaceUpdateAttemptTimeMap[eviction.GetPodID(pod)] = time.Now()
     evictErr := evictionLimiter.InPlaceUpdate(pod, vpa, u.eventRecorder)
     if evictErr != nil {
-        klog.Warningf("updating pod %v failed: %v", pod.Name, evictErr)
+        klog.ErrorS(evictErr, "updating pod failed", "pod", klog.KObj(pod))
     } else {
         // TODO(jkyros): come back later for stats
         withInPlaceUpdated = false

vertical-pod-autoscaler/pkg/updater/priority/priority_processor.go

Lines changed: 2 additions & 2 deletions
@@ -89,9 +89,9 @@ func (*defaultPriorityProcessor) GetUpdatePriority(pod *apiv1.Pod, vpa *vpa_type
        // It's okay if we're actually still resizing, but if we can't now or we're stuck, make sure the pod
        // is still in the list so we can evict it to go live on a fatter node or something
        if pod.Status.Resize == apiv1.PodResizeStatusDeferred || pod.Status.Resize == apiv1.PodResizeStatusInfeasible {
-           klog.V(4).Infof("Pod %s looks like it's stuck scaling up (%v state), leaving it in for eviction", pod.Name, pod.Status.Resize)
+           klog.V(4).InfoS("Pod looks like it's stuck scaling up, leaving it in for eviction", "pod", klog.KObj(pod), "resizeStatus", pod.Status.Resize)
        } else {
-           klog.V(4).Infof("Pod %s is in the process of scaling up (%v state), leaving it in so we can see if it's taking too long", pod.Name, pod.Status.Resize)
+           klog.V(4).InfoS("Pod is in the process of scaling up, leaving it in so we can see if it's taking too long", "pod", klog.KObj(pod), "resizeStatus", pod.Status.Resize)
        }
    }
    // I guess if it's not outside of compliance, it's probably okay it's stuck here?

vertical-pod-autoscaler/pkg/updater/priority/update_priority_calculator.go

Lines changed: 9 additions & 4 deletions
@@ -138,15 +138,19 @@ func (calc *UpdatePriorityCalculator) AddPod(pod *apiv1.Pod, now time.Time) {
        // TODO(jkyros): do we need an in-place update threshold arg ?
        // If our recommendations are disruptionless, we can bypass the threshold limit
        if len(disruptionlessRecommendation.ContainerRecommendations) > 0 {
-           klog.V(2).Infof("Short-lived, but pod still accepted for DISRUPTIONLESS (%d/%d) in-place update %v/%v with priority %v", len(disruptionlessRecommendation.ContainerRecommendations), len(processedRecommendation.ContainerRecommendations), pod.Namespace, pod.Name, updatePriority.ResourceDiff)
+           klog.V(2).InfoS("Short-lived, but pod still accepted for disruptionless in-place update",
+               "pod", klog.KObj(pod),
+               "numContainers", len(pod.Spec.Containers),
+               "resourceDiff", updatePriority.ResourceDiff,
+               "fractionOfDisruptionlessRecommendations", len(disruptionlessRecommendation.ContainerRecommendations)/len(processedRecommendation.ContainerRecommendations),
+           )
            updatePriority.Disruptionless = true
            calc.pods = append(calc.pods, PrioritizedPod{
                pod: pod,
                priority: updatePriority,
                recommendation: disruptionlessRecommendation})
        } else {
-           // if it's not disruptionless, we fallback to the Recreate conditions which already failed
-           // (quick oom, outside recommended range, long-lived + significant change), so don't update this pod
+           // we cannot perform this update disruption-free, so do not update this pod's resources
            klog.V(4).InfoS("Not updating a short-lived pod, request within recommended range", "pod", klog.KObj(pod))
        }
        return
@@ -176,7 +180,7 @@ func (calc *UpdatePriorityCalculator) GetSortedPrioritizedPods(admission PodEvic
        if admission.Admit(podPrio.pod, podPrio.recommendation) {
            result = append(result, &calc.pods[num])
        } else {
-           klog.V(2).Infof("pod removed from update queue by PodEvictionAdmission: %v", podPrio.pod.Name)
+           klog.V(2).InfoS("Pod removed from update queue by PodEvictionAdmission", "pod", klog.KObj(podPrio.pod))
        }
    }

@@ -325,6 +329,7 @@ func (calc *UpdatePriorityCalculator) CalculateDisruptionFreeActions(pod *apiv1.
        resourceRestartPolicy := getRestartPolicyForResource(resource, container.ResizePolicy)
        // If we don't have one, that's probably bad
        if resourceRestartPolicy == nil {
+           klog.V(4).InfoS("Container does not have a resourceResizeRestartPolicy", "pod", klog.KObj(pod), "container", container.Name)
            continue
        }
        // If we do have one, and it's disruptive, then we know this won't work
