@@ -123,20 +123,19 @@ func (e *podsEvictionRestrictionImpl) CanEvict(pod *apiv1.Pod) bool {
 	// they might cause disruption. We assume pods will not be both in-place updated and evicted in the same pass, but
 	// we need eviction to take the numbers into account so we don't violate our disruption tolerances.
 	// If we're already resizing this pod, don't do anything to it, unless we failed to resize it, then we want to evict it.
+	klog.V(4).InfoS("Pod disruption tolerance",
+		"pod", klog.KObj(pod),
+		"running", singleGroupStats.running,
+		"configured", singleGroupStats.configured,
+		"tolerance", singleGroupStats.evictionTolerance,
+		"evicted", singleGroupStats.evicted,
+		"updating", singleGroupStats.inPlaceUpdating)
 	if IsInPlaceUpdating(pod) {
-		klog.V(4).InfoS("Pod disruption tolerance",
-			"pod", pod.Name,
-			"running", singleGroupStats.running,
-			"configured", singleGroupStats.configured,
-			"tolerance", singleGroupStats.evictionTolerance,
-			"evicted", singleGroupStats.evicted,
-			"updating", singleGroupStats.inPlaceUpdating)
-
 		if singleGroupStats.running - (singleGroupStats.evicted + (singleGroupStats.inPlaceUpdating - 1)) > shouldBeAlive {
-			klog.V(4).Infof("Would be able to evict, but already resizing %s", pod.Name)
+			klog.V(4).InfoS("Would be able to evict, but already resizing", "pod", klog.KObj(pod))

			if pod.Status.Resize == apiv1.PodResizeStatusInfeasible || pod.Status.Resize == apiv1.PodResizeStatusDeferred {
-				klog.Warningf("Attempted in-place resize of %s impossible, should now evict", pod.Name)
+				klog.InfoS("Attempted in-place resize was impossible, should now evict", "pod", klog.KObj(pod), "resizePolicy", pod.Status.Resize)
				return true
			}
		}
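
A note on the arithmetic in this hunk: shouldBeAlive is the configured replica count minus the eviction tolerance, and a pod that is already resizing is excluded from the disrupted count before eviction is considered. The following is a minimal, self-contained sketch of that check; the struct, names, and numbers are illustrative, not code from this patch:

package main

import "fmt"

// groupStats is an illustrative stand-in for the singleGroupStats fields used above.
type groupStats struct {
    configured        int // replicas the controller is configured to run
    running           int // replicas currently running
    evicted           int // replicas already evicted in this pass
    inPlaceUpdating   int // replicas currently being resized in place
    evictionTolerance int // replicas we are allowed to disrupt at once
}

// couldStillEvict mirrors the condition above: the pod under consideration is
// itself one of the in-place updating pods, so it is removed from the
// disrupted count before comparing against the minimum that should stay alive.
func couldStillEvict(s groupStats) bool {
    shouldBeAlive := s.configured - s.evictionTolerance
    return s.running-(s.evicted+(s.inPlaceUpdating-1)) > shouldBeAlive
}

func main() {
    s := groupStats{configured: 5, running: 5, evicted: 0, inPlaceUpdating: 1, evictionTolerance: 1}
    // 5 - (0 + 0) = 5 > 4, so eviction would still fit within the tolerance.
    fmt.Println(couldStillEvict(s))
}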
@@ -441,74 +440,67 @@ func setUpInformer(kubeClient kube_client.Interface, kind controllerKind) (cache

 // CanInPlaceUpdate performs the same checks as CanEvict does, to determine whether the given pod can be resized in place instead of evicted.
 func (e *podsEvictionRestrictionImpl) CanInPlaceUpdate(pod *apiv1.Pod) bool {
-
 	cr, present := e.podToReplicaCreatorMap[GetPodID(pod)]
 	if present {
-
 		// If our QoS class is guaranteed, we can't change the resources without a restart
-		// TODO(maxcao13): kubelet already prevents a resize of a guaranteed pod, so should we still check this early?
 		if pod.Status.QOSClass == apiv1.PodQOSGuaranteed {
-			klog.Warningf("Can't resize %s in-place, pod QoS is %s", pod.Name, pod.Status.QOSClass)
 			return false
 		}

-		// If we're already resizing this pod, don't do it again
 		if IsInPlaceUpdating(pod) {
-			klog.Warningf("Not resizing %s, already resizing it", pod.Name)
 			return false
 		}

 		noRestartPoliciesPopulated := true
+		isPodRestartPolicyNever := pod.Spec.RestartPolicy == apiv1.RestartPolicyNever

 		for _, container := range pod.Spec.Containers {
 			// If some of these are populated, we know it at least understands resizing
 			if len(container.ResizePolicy) > 0 {
 				noRestartPoliciesPopulated = false
 			}

-			// TODO(maxcao13): Do we have to check the policy resource too? i.e. if only memory is getting scaled, then only check the memory resize policy?
 			for _, policy := range container.ResizePolicy {
 				if policy.RestartPolicy != apiv1.NotRequired {
-					klog.Warningf("in-place resize of %s will cause container disruption, container %s restart policy is %v", pod.Name, container.Name, policy.RestartPolicy)
+					klog.V(4).InfoS("in-place resize of pod will cause container disruption, because of container resize policy", "pod", klog.KObj(pod), "container", container.Name, "containerResizeRestartPolicy", policy.RestartPolicy)
					// TODO(jkyros): is there something that prevents this from happening elsewhere in the API?
-					if pod.Spec.RestartPolicy == apiv1.RestartPolicyNever {
-						klog.Warningf("in-place resize of %s not possible, container %s resize policy is %v but pod restartPolicy is %v", pod.Name, container.Name, policy.RestartPolicy, pod.Spec.RestartPolicy)
+					if isPodRestartPolicyNever {
+						klog.InfoS("in-place resize of pod not possible, container resize policy and pod restartPolicy conflict", "pod", klog.KObj(pod), "container", container.Name, "containerResizeRestartPolicy", policy.RestartPolicy, "podRestartPolicy", pod.Spec.RestartPolicy)
 						return false
 					}
-
 				}
 			}
 		}

 		// If none of the policies are populated, our feature is probably not enabled, so we can't in-place regardless
 		if noRestartPoliciesPopulated {
-			klog.Warningf("impossible to resize %s in-place, container resize policies are not populated", pod.Name)
+			klog.InfoS("impossible to resize pod in-place, container resize policies are not populated", "pod", klog.KObj(pod))
 		}

 		// TODO(jkyros): Come back and handle sidecar containers at some point since they're weird?
 		singleGroupStats, present := e.creatorToSingleGroupStatsMap[cr]
 		// If we're pending, we can't in-place resize
 		// TODO(jkyros): are we sure we can't? Should I just set this to "if running"?
 		if pod.Status.Phase == apiv1.PodPending {
-			klog.V(4).Infof("Can't resize pending pod %s", pod.Name)
+			klog.V(4).InfoS("Can't resize pending pod", "pod", klog.KObj(pod))
 			return false
 		}
-		// This second "present" check is against the creator-to-group-stats map, not the pod-to-replica map
-		// TODO(maxcao13): Not sure, but do we need disruption tolerance for in-place updates? I guess we do since they are not guaranteed to be disruptionless...
-		// TODO(maxcao13): If this is okay, I may have to rename evictionTolerance to disruption tolerance
+		// TODO(maxcao13): May need to rename evictionTolerance to disruptionTolerance
 		if present {
+			// minimum number of pods that should be running to tolerate disruptions
+			shouldBeAlive := singleGroupStats.configured - singleGroupStats.evictionTolerance
+			// number of pods that are actually running
+			actuallyAlive := singleGroupStats.running - (singleGroupStats.evicted + singleGroupStats.inPlaceUpdating)
 			klog.V(4).InfoS("Checking pod disruption tolerance",
 				"podName", pod.Name,
 				"configuredPods", singleGroupStats.configured,
 				"runningPods", singleGroupStats.running,
 				"evictedPods", singleGroupStats.evicted,
 				"inPlaceUpdatingPods", singleGroupStats.inPlaceUpdating,
 				"evictionTolerance", singleGroupStats.evictionTolerance,
+				"shouldBeAlive", shouldBeAlive,
+				"actuallyAlive", actuallyAlive,
 			)
-			// minimum number of pods that should be running to tolerate disruptions
-			shouldBeAlive := singleGroupStats.configured - singleGroupStats.evictionTolerance
-			// number of pods that are actually running
-			actuallyAlive := singleGroupStats.running - (singleGroupStats.evicted + singleGroupStats.inPlaceUpdating)
 			if actuallyAlive > shouldBeAlive {
 				klog.V(4).InfoS("Pod can be resized in-place; more pods are running than required", "podName", pod.Name, "shouldBeAlive", shouldBeAlive, "actuallyAlive", actuallyAlive)
 				return true
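
The ResizePolicy loop above is what decides whether an in-place update would be disruptive: a policy of RestartContainer means the kubelet restarts that container to apply the resize, and a pod whose restartPolicy is Never can never absorb that restart. Below is a rough standalone illustration of that rule, using only the core/v1 types referenced in the hunk; the pod spec is invented for the example and is not the updater's actual wiring:

package main

import (
    "fmt"

    apiv1 "k8s.io/api/core/v1"
)

// resizeWouldConflict reports whether any container's resize policy requires a
// restart while the pod itself is never allowed to restart, mirroring the
// conflict check in CanInPlaceUpdate.
func resizeWouldConflict(pod *apiv1.Pod) bool {
    if pod.Spec.RestartPolicy != apiv1.RestartPolicyNever {
        return false
    }
    for _, container := range pod.Spec.Containers {
        for _, policy := range container.ResizePolicy {
            if policy.RestartPolicy != apiv1.NotRequired {
                return true
            }
        }
    }
    return false
}

func main() {
    pod := &apiv1.Pod{
        Spec: apiv1.PodSpec{
            RestartPolicy: apiv1.RestartPolicyNever,
            Containers: []apiv1.Container{{
                Name: "app",
                ResizePolicy: []apiv1.ContainerResizePolicy{{
                    ResourceName:  apiv1.ResourceMemory,
                    RestartPolicy: apiv1.RestartContainer,
                }},
            }},
        },
    }
    // true: the memory resize needs a container restart that this pod can never perform.
    fmt.Println(resizeWouldConflict(pod))
}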
@@ -532,16 +524,10 @@ func (e *podsEvictionRestrictionImpl) CanInPlaceUpdate(pod *apiv1.Pod) bool {
 func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa *vpa_types.VerticalPodAutoscaler, eventRecorder record.EventRecorder) error {
 	cr, present := e.podToReplicaCreatorMap[GetPodID(podToUpdate)]
 	if !present {
-		return fmt.Errorf("pod not suitable for eviction %v : not in replicated pods map", podToUpdate.Name)
+		return fmt.Errorf("pod not suitable for eviction %v: not in replicated pods map", podToUpdate.Name)
 	}

-	// TODO(maxcao13): Not sure if we need to check again here, but commenting it out for now in case we do
-	// if !e.CanInPlaceUpdate(podToUpdate) {
-	// 	return fmt.Errorf("cannot update pod %v in place : number of in-flight updates exceeded", podToUpdate.Name)
-	// }
-
 	// TODO(maxcao13): There's maybe a more efficient way to do this, but this is what we have for now
-
 	// separate patches since we have to patch resize and spec separately
 	resourcePatches := []resource_updates.PatchRecord{}
 	annotationPatches := []resource_updates.PatchRecord{}
@@ -551,7 +537,7 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
 	for i, calculator := range e.patchCalculators {
 		p, err := calculator.CalculatePatches(podToUpdate, vpa)
 		if err != nil {
-			return fmt.Errorf("failed to calculate resource patch for pod %s/%s: %v", podToUpdate.Namespace, podToUpdate.Name, err)
+			return err
 		}
 		klog.V(4).InfoS("Calculated patches for pod", "pod", klog.KObj(podToUpdate), "patches", p)
 		// TODO(maxcao13): change how this works later, this is gross and depends on the resource calculator being first in the slice
@@ -565,35 +551,28 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
 	if len(resourcePatches) > 0 {
 		patch, err := json.Marshal(resourcePatches)
 		if err != nil {
-			klog.Errorf("Cannot marshal the patch %v: %v", resourcePatches, err)
 			return err
 		}

 		res, err := e.client.CoreV1().Pods(podToUpdate.Namespace).Patch(context.TODO(), podToUpdate.Name, k8stypes.JSONPatchType, patch, metav1.PatchOptions{}, "resize")
 		if err != nil {
-			klog.ErrorS(err, "Failed to patch pod", "pod", klog.KObj(podToUpdate))
 			return err
 		}
 		klog.V(4).InfoS("In-place patched pod /resize subresource using patches", "pod", klog.KObj(res), "patches", string(patch))

-		// TODO(maxcao13): whether or not we apply annotation patches should depend on resource patches?
 		if len(annotationPatches) > 0 {
 			patch, err := json.Marshal(annotationPatches)
 			if err != nil {
-				klog.Errorf("Cannot marshal the patch %v: %v", annotationPatches, err)
 				return err
 			}
 			res, err = e.client.CoreV1().Pods(podToUpdate.Namespace).Patch(context.TODO(), podToUpdate.Name, k8stypes.JSONPatchType, patch, metav1.PatchOptions{})
 			if err != nil {
-				klog.ErrorS(err, "Failed to patch pod", "pod", klog.KObj(podToUpdate))
 				return err
 			}
 			klog.V(4).InfoS("Patched pod annotations", "pod", klog.KObj(res), "patches", string(patch))
 		}
 	} else {
-		err := fmt.Errorf("no patches to apply to %s", podToUpdate.Name)
-		klog.ErrorS(err, "Failed to patch pod", "pod", klog.KObj(podToUpdate))
-		return err
+		return fmt.Errorf("no resource patches were calculated to apply")
 	}

 	// TODO(maxcao13): If this keeps getting called on the same object with the same reason, it is considered a patch request.
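
For context on the two Patch calls above: the resource changes are applied to the pod's resize subresource as a JSON patch, while the annotation patch goes to the pod resource itself. A hedged sketch of what a direct resize-subresource patch might look like with a bare client-go clientset follows; the pod name, namespace, and patch path are illustrative, not what the patch calculators emit:

package main

import (
    "context"
    "fmt"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    k8stypes "k8s.io/apimachinery/pkg/types"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/tools/clientcmd"
)

func main() {
    // Assumes a reachable cluster via the default kubeconfig and a pod named
    // "example" in "default", with the InPlacePodVerticalScaling feature enabled.
    config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
    if err != nil {
        panic(err)
    }
    client := kubernetes.NewForConfigOrDie(config)

    // JSON patch equivalent of a single resource PatchRecord: bump the first
    // container's CPU request.
    patch := []byte(`[{"op":"replace","path":"/spec/containers/0/resources/requests/cpu","value":"200m"}]`)

    pod, err := client.CoreV1().Pods("default").Patch(context.TODO(),
        "example", k8stypes.JSONPatchType, patch, metav1.PatchOptions{}, "resize")
    if err != nil {
        panic(err)
    }
    fmt.Println("patched", pod.Name, "resize status:", pod.Status.Resize)
}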
@@ -606,30 +585,47 @@ func (e *podsEvictionRestrictionImpl) InPlaceUpdate(podToUpdate *apiv1.Pod, vpa
 	if podToUpdate.Status.Phase == apiv1.PodRunning {
 		singleGroupStats, present := e.creatorToSingleGroupStatsMap[cr]
 		if !present {
-			klog.Errorf("Internal error - cannot find stats for replication group %v", cr)
+			klog.InfoS("Internal error - cannot find stats for replication group", "pod", klog.KObj(podToUpdate), "podReplicaCreator", cr)
 		} else {
 			singleGroupStats.inPlaceUpdating = singleGroupStats.inPlaceUpdating + 1
 			e.creatorToSingleGroupStatsMap[cr] = singleGroupStats
 		}
 	} else {
-		klog.Warningf("I updated, but my pod phase was %s", podToUpdate.Status.Phase)
+		klog.InfoS("Attempted to in-place update, but pod was not running", "pod", klog.KObj(podToUpdate), "phase", podToUpdate.Status.Phase)
 	}

 	return nil
 }

 // IsInPlaceUpdating checks whether or not the given pod is currently in the middle of an in-place update
 func IsInPlaceUpdating(podToCheck *apiv1.Pod) (isUpdating bool) {
-	// If the pod is currently updating we need to tally that
-	if podToCheck.Status.Resize != "" {
-		klog.V(4).InfoS("Pod is currently resizing", "pod", klog.KObj(podToCheck), "status", podToCheck.Status.Resize)
-		// Proposed -> Deferred -> InProgress, but what about Infeasible?
-		if podToCheck.Status.Resize == apiv1.PodResizeStatusInfeasible {
-			klog.V(4).InfoS("Resource proposal for pod is Infeasible, we're probably stuck like this until we evict", "pod", klog.KObj(podToCheck))
-		} else if podToCheck.Status.Resize == apiv1.PodResizeStatusDeferred {
-			klog.V(4).InfoS("Resource proposal for pod is Deferred, we're probably stuck like this until we evict", "pod", klog.KObj(podToCheck))
-		}
-		return true
+	return podToCheck.Status.Resize != ""
+}
+
+// UpdatablePod is a Pod wrapper that records helpful attributes of a Pod
+// to be used when attempting to evict or in-place update it.
+type UpdatablePod struct {
+	pod            *apiv1.Pod
+	disruptionless bool
+}
+
+// Pod returns the underlying pod
+func (p *UpdatablePod) Pod() *apiv1.Pod {
+	return p.pod
+}
+
+// IsDisruptionless returns the disruptionless status of the UpdatablePod
+func (p *UpdatablePod) IsDisruptionless() bool {
+	return p.disruptionless
+}
+
+// SetDisruptionless sets the disruptionless status of the UpdatablePod to true
+func (p *UpdatablePod) SetDisruptionless() {
+	p.disruptionless = true
+}
+
+// NewUpdatablePod wraps the given pod in an UpdatablePod
+func NewUpdatablePod(pod *apiv1.Pod) *UpdatablePod {
+	return &UpdatablePod{
+		pod: pod,
 	}
-	return false
 }
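
The UpdatablePod wrapper added at the end of this hunk is a plain accessor type; its callers are outside this diff, so the following is only a guess at the intended usage pattern. It is assumed to live in the same package (and reuse its apiv1 and klog imports), and the resizeRestartsContainers parameter stands in for whatever restart check the updater actually performs:

// Hypothetical caller-side sketch, not part of this patch.
func markIfDisruptionless(pod *apiv1.Pod, resizeRestartsContainers bool) *UpdatablePod {
    up := NewUpdatablePod(pod)
    if !resizeRestartsContainers {
        up.SetDisruptionless()
    }
    if up.IsDisruptionless() {
        // A disruptionless update would not need to count against the eviction tolerance.
        klog.V(4).InfoS("In-place update is disruptionless", "pod", klog.KObj(up.Pod()))
    }
    return up
}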