Commit 47da0c2

VPA: add in-place VPA e2e suite
So far this is just:
- Checks to see if InPlacePodVerticalScaling is in use
- Make sure it scales when it can

But we still need a bunch of other ones like:
- Test fallback to eviction
- Test timeout/eviction when it gets stuck, etc.

(Maybe once this works we can squish it together with the updater test, but until then it's separate.)
1 parent 701d53a commit 47da0c2

File tree: 2 files changed, +372 -0 lines


vertical-pod-autoscaler/e2e/v1/common.go

Lines changed: 125 additions & 0 deletions
@@ -47,6 +47,7 @@ const (
 	admissionControllerComponent = "admission-controller"
 	fullVpaSuite = "full-vpa"
 	actuationSuite = "actuation"
+	inPlaceSuite = "in-place"
 	pollInterval = 10 * time.Second
 	pollTimeout = 15 * time.Minute
 	cronJobsWaitTimeout = 15 * time.Minute
@@ -86,6 +87,11 @@ func UpdaterE2eDescribe(name string, body func()) bool {
 	return E2eDescribe(updateComponent, name, body)
 }
 
+// InPlaceE2eDescribe describes a VPA in-place e2e test.
+func InPlaceE2eDescribe(name string, body func()) bool {
+	return E2eDescribe(inPlaceSuite, name, body)
+}
+
 // AdmissionControllerE2eDescribe describes a VPA admission controller e2e test.
 func AdmissionControllerE2eDescribe(name string, body func()) bool {
 	return E2eDescribe(admissionControllerComponent, name, body)
@@ -128,6 +134,24 @@ func SetupHamsterDeployment(f *framework.Framework, cpu, memory string, replicas
 	return d
 }
 
+// SetupHamsterDeploymentWithLimits creates and installs a simple hamster deployment with resource limits
+// for e2e test purposes, then makes sure the deployment is running. The limits are necessary for the in-place tests,
+// as InPlacePodVerticalScaling does not currently work if limits are not populated.
+func SetupHamsterDeploymentWithLimits(f *framework.Framework, cpu, memory, cpuLimit, memoryLimit string, replicas int32) *appsv1.Deployment {
+	cpuQuantity := ParseQuantityOrDie(cpu)
+	memoryQuantity := ParseQuantityOrDie(memory)
+	cpuLimitQuantity := ParseQuantityOrDie(cpuLimit)
+	memoryLimitQuantity := ParseQuantityOrDie(memoryLimit)
+
+	d := NewHamsterDeploymentWithResourcesAndLimits(f, cpuQuantity, memoryQuantity, cpuLimitQuantity, memoryLimitQuantity)
+	d.Spec.Replicas = &replicas
+	d, err := f.ClientSet.AppsV1().Deployments(f.Namespace.Name).Create(context.TODO(), d, metav1.CreateOptions{})
+	gomega.Expect(err).NotTo(gomega.HaveOccurred(), "unexpected error when starting deployment creation")
+	err = framework_deployment.WaitForDeploymentComplete(f.ClientSet, d)
+	gomega.Expect(err).NotTo(gomega.HaveOccurred(), "unexpected error waiting for deployment creation to finish")
+	return d
+}
+
 // NewHamsterDeployment creates a simple hamster deployment for e2e test purposes.
 func NewHamsterDeployment(f *framework.Framework) *appsv1.Deployment {
 	return NewNHamstersDeployment(f, 1)
@@ -566,3 +590,104 @@ func InstallLimitRangeWithMin(f *framework.Framework, minCpuLimit, minMemoryLimi
 	minMemoryLimitQuantity := ParseQuantityOrDie(minMemoryLimit)
 	installLimitRange(f, &minCpuLimitQuantity, &minMemoryLimitQuantity, nil, nil, lrType)
 }
+
+// WaitForPodsUpdatedWithoutEviction waits for pods to be updated without any evictions taking place over the polling
+// interval.
+func WaitForPodsUpdatedWithoutEviction(f *framework.Framework, initialPods, podList *apiv1.PodList) error {
+	// TODO(jkyros): This needs to:
+	// 1. Make sure we wait for each of the containers to get an update queued
+	// 2. Make sure each of the containers actually finishes the update
+	// 3. Once everyone has gone through 1 cycle, we don't care anymore, we can move on (it will keep scaling, obviously)
+	framework.Logf("waiting for update to start and resources to differ")
+	var resourcesHaveDiffered bool
+	err := wait.PollUntilContextTimeout(context.TODO(), pollInterval, pollTimeout, false, func(context.Context) (bool, error) {
+		// TODO(jkyros): make sure we don't update too many pods at once
+		podList, err := GetHamsterPods(f)
+		if err != nil {
+			return false, err
+		}
+		resourcesAreSynced := true
+		podMissing := false
+		// Go through the list of initial pods
+		for _, initialPod := range initialPods.Items {
+			found := false
+			// Go through the list of pods we have now
+			for _, pod := range podList.Items {
+				// If we still have our initial pod, good
+				if initialPod.Name == pod.Name {
+					found = true
+
+					// Check to see if we have our container resources updated
+					for num, container := range pod.Spec.Containers {
+						// If our current spec differs from initial, we know we were told to update
+						if !resourcesHaveDiffered {
+							for resourceName, resourceLimit := range container.Resources.Limits {
+								initialResourceLimit := initialPod.Spec.Containers[num].Resources.Limits[resourceName]
+								if !initialResourceLimit.Equal(resourceLimit) {
+									framework.Logf("%s/%s: %s limit (%v) differs from initial (%v), change has started", pod.Name, container.Name, resourceName, resourceLimit.String(), initialResourceLimit.String())
+									//fmt.Printf("UPD: L:%s: %s/%s %v differs from initial %v\n", resourceName, pod.Name, container.Name, resourceLimit, pod.Status.ContainerStatuses[num].Resources.Limits[resourceName])
+									resourcesHaveDiffered = true
+
+								}
+
+							}
+							for resourceName, resourceRequest := range container.Resources.Requests {
+								initialResourceRequest := initialPod.Spec.Containers[num].Resources.Requests[resourceName]
+								if !initialResourceRequest.Equal(resourceRequest) {
+									framework.Logf("%s/%s: %s request (%v) differs from initial (%v), change has started", pod.Name, container.Name, resourceName, resourceRequest.String(), initialResourceRequest.String())
+									resourcesHaveDiffered = true
+
+								}
+							}
+						}
+
+						if len(pod.Status.ContainerStatuses) > num {
+							if pod.Status.ContainerStatuses[num].Resources != nil {
+								for resourceName, resourceLimit := range container.Resources.Limits {
+									statusResourceLimit := pod.Status.ContainerStatuses[num].Resources.Limits[resourceName]
+									if !statusResourceLimit.Equal(resourceLimit) {
+										framework.Logf("%s/%s: %s limit spec (%v) differs from limit status (%v), still in progress", pod.Name, container.Name, resourceName, resourceLimit.String(), statusResourceLimit.String())
+
+										resourcesAreSynced = false
+
+									}
+
+								}
+								for resourceName, resourceRequest := range container.Resources.Requests {
+									statusResourceRequest := pod.Status.ContainerStatuses[num].Resources.Requests[resourceName]
+									if !statusResourceRequest.Equal(resourceRequest) {
+										framework.Logf("%s/%s: %s request spec (%v) differs from request status (%v), still in progress", pod.Name, container.Name, resourceName, resourceRequest.String(), statusResourceRequest.String())
+										resourcesAreSynced = false
+
+									}
+								}
+
+							} else {
+								framework.Logf("%s/%s: container status resources are unexpectedly empty", pod.Name, container.Name)
+							}
+						}
+
+					}
+				}
+
+			}
+			if !found {
+				//framework.Logf("pod %s was evicted and should not have been\n", initialPod.Name)
+				podMissing = true
+			}
+
+		}
+		if podMissing {
+			return false, fmt.Errorf("a pod was erroneously evicted")
+		}
+		if len(podList.Items) > 0 && resourcesAreSynced {
+			if !resourcesHaveDiffered {
+				return false, nil
+			}
+			framework.Logf("after checking %d pods, we are in sync", len(podList.Items))
+			return true, nil
+		}
+		return false, nil
+	})
+	return err
+}
Lines changed: 247 additions & 0 deletions
@@ -0,0 +1,247 @@
/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package autoscaling

import (
	"context"
	"fmt"
	"time"

	autoscaling "k8s.io/api/autoscaling/v1"
	apiv1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	vpa_types "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1"
	"k8s.io/autoscaler/vertical-pod-autoscaler/pkg/utils/status"
	"k8s.io/autoscaler/vertical-pod-autoscaler/pkg/utils/test"
	"k8s.io/kubernetes/test/e2e/framework"
	podsecurity "k8s.io/pod-security-admission/api"

	ginkgo "github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
)

var _ = InPlaceE2eDescribe("In-Place", func() {

	f := framework.NewDefaultFramework("vertical-pod-autoscaling")
	f.NamespacePodSecurityEnforceLevel = podsecurity.LevelBaseline

	// TODO(jkyros): clean this up, some kind of helper, stash the InPlacePodVerticalScalingInUse function somewhere
	// useful
	// TODO(jkyros): the in-place tests check first to see if in-place is in use, and if it's not, there's nothing to test. I bet there's
	// precedent on how to test a gated feature with ginkgo; I should find out what it is.
	var InPlacePodVerticalScalingNotInUse bool
	ginkgo.It("Should have InPlacePodVerticalScaling in-use", func() {

		ginkgo.By("Verifying the existence of container ResizePolicy")
		checkPod := &apiv1.Pod{}
		checkPod.Name = "inplace"
		checkPod.Namespace = f.Namespace.Name
		checkPod.Spec.Containers = append(checkPod.Spec.Containers, SetupHamsterContainer("100m", "10Mi"))
		_, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Create(context.Background(), checkPod, metav1.CreateOptions{})
		gomega.Expect(err).NotTo(gomega.HaveOccurred())
		checkPod, err = f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(context.Background(), checkPod.Name, metav1.GetOptions{})
		gomega.Expect(err).NotTo(gomega.HaveOccurred())

		if !InPlacePodVerticalScalingInUse(checkPod) {
			InPlacePodVerticalScalingNotInUse = true
			ginkgo.Skip("InPlacePodVerticalScaling was not in use (containers had no ResizePolicy)")
		}
	})

	ginkgo.It("In-place update pods when Admission Controller status available", func() {
		if InPlacePodVerticalScalingNotInUse {
			ginkgo.Skip("InPlacePodVerticalScaling was not in use (containers had no ResizePolicy)")
		}
		const statusUpdateInterval = 10 * time.Second

		ginkgo.By("Setting up the Admission Controller status")
		stopCh := make(chan struct{})
		statusUpdater := status.NewUpdater(
			f.ClientSet,
			status.AdmissionControllerStatusName,
			status.AdmissionControllerStatusNamespace,
			statusUpdateInterval,
			"e2e test",
		)
		defer func() {
			// Schedule a cleanup of the Admission Controller status.
			// Status is created outside the test namespace.
			ginkgo.By("Deleting the Admission Controller status")
			close(stopCh)
			err := f.ClientSet.CoordinationV1().Leases(status.AdmissionControllerStatusNamespace).
				Delete(context.TODO(), status.AdmissionControllerStatusName, metav1.DeleteOptions{})
			gomega.Expect(err).NotTo(gomega.HaveOccurred())
		}()
		statusUpdater.Run(stopCh)

		podList := setupPodsForUpscalingInPlace(f)
		if len(podList.Items[0].Spec.Containers[0].ResizePolicy) <= 0 {
			// Feature is probably not working here
			ginkgo.Skip("Skipping test, InPlacePodVerticalScaling not available")
		}

		initialPods := podList.DeepCopy()
		// 1. Take the initial pod list
		// 2. Loop through and compare all the resource values
		// 3. When they change, it's good

		ginkgo.By("Waiting for pods to be in-place updated")

		//gomega.Expect(err).NotTo(gomega.HaveOccurred())
		err := WaitForPodsUpdatedWithoutEviction(f, initialPods, podList)
		gomega.Expect(err).NotTo(gomega.HaveOccurred())
	})

	ginkgo.It("Does not evict pods for downscaling in-place", func() {
		if InPlacePodVerticalScalingNotInUse {
			ginkgo.Skip("InPlacePodVerticalScaling was not in use (containers had no ResizePolicy)")
		}
		const statusUpdateInterval = 10 * time.Second

		ginkgo.By("Setting up the Admission Controller status")
		stopCh := make(chan struct{})
		statusUpdater := status.NewUpdater(
			f.ClientSet,
			status.AdmissionControllerStatusName,
			status.AdmissionControllerStatusNamespace,
			statusUpdateInterval,
			"e2e test",
		)
		defer func() {
			// Schedule a cleanup of the Admission Controller status.
			// Status is created outside the test namespace.
			ginkgo.By("Deleting the Admission Controller status")
			close(stopCh)
			err := f.ClientSet.CoordinationV1().Leases(status.AdmissionControllerStatusNamespace).
				Delete(context.TODO(), status.AdmissionControllerStatusName, metav1.DeleteOptions{})
			gomega.Expect(err).NotTo(gomega.HaveOccurred())
		}()
		statusUpdater.Run(stopCh)

		podList := setupPodsForDownscalingInPlace(f, nil)
		if len(podList.Items[0].Spec.Containers[0].ResizePolicy) <= 0 {
			// Feature is probably not working here
			ginkgo.Skip("Skipping test, InPlacePodVerticalScaling not available")
		}
		initialPods := podList.DeepCopy()

		ginkgo.By("Waiting for pods to be in-place downscaled")
		err := WaitForPodsUpdatedWithoutEviction(f, initialPods, podList)
		gomega.Expect(err).NotTo(gomega.HaveOccurred())
	})

	ginkgo.It("evicts pods when Admission Controller status available", func() {
		if InPlacePodVerticalScalingNotInUse {
			ginkgo.Skip("InPlacePodVerticalScaling was not in use (containers had no ResizePolicy)")
		}

		const statusUpdateInterval = 10 * time.Second

		ginkgo.By("Setting up the Admission Controller status")
		stopCh := make(chan struct{})
		statusUpdater := status.NewUpdater(
			f.ClientSet,
			status.AdmissionControllerStatusName,
			status.AdmissionControllerStatusNamespace,
			statusUpdateInterval,
			"e2e test",
		)
		defer func() {
			// Schedule a cleanup of the Admission Controller status.
			// Status is created outside the test namespace.
			ginkgo.By("Deleting the Admission Controller status")
			close(stopCh)
			err := f.ClientSet.CoordinationV1().Leases(status.AdmissionControllerStatusNamespace).
				Delete(context.TODO(), status.AdmissionControllerStatusName, metav1.DeleteOptions{})
			gomega.Expect(err).NotTo(gomega.HaveOccurred())
		}()
		statusUpdater.Run(stopCh)

		podList := setupPodsForUpscalingEviction(f)

		ginkgo.By("Waiting for pods to be evicted")
		err := WaitForPodsEvicted(f, podList)
		gomega.Expect(err).NotTo(gomega.HaveOccurred())
	})

	// TODO(jkyros):
	// - It falls back to eviction when in-place is Deferred for a minute (hard to fake, depends on node size)
	// - It falls back to eviction when in-place is Infeasible (easy to fake)
	// - It falls back to eviction when InProgress for more than an hour (maybe fake with an annotation?)

})

func setupPodsForUpscalingInPlace(f *framework.Framework) *apiv1.PodList {
	return setupPodsForInPlace(f, "100m", "100Mi", "200m", "200Mi", nil)
}

func setupPodsForDownscalingInPlace(f *framework.Framework, er []*vpa_types.EvictionRequirement) *apiv1.PodList {
	return setupPodsForInPlace(f, "500m", "500Mi", "600m", "600Mi", er)
}

func setupPodsForInPlace(f *framework.Framework, hamsterCPU, hamsterMemory, hamsterCPULimit, hamsterMemoryLimit string, er []*vpa_types.EvictionRequirement) *apiv1.PodList {
	controller := &autoscaling.CrossVersionObjectReference{
		APIVersion: "apps/v1",
		Kind:       "Deployment",
		Name:       "hamster-deployment",
	}
	ginkgo.By(fmt.Sprintf("Setting up a hamster %v", controller.Kind))
	// TODO(jkyros): I didn't want to mangle all the plumbing just yet
	//setupHamsterController(f, controller.Kind, hamsterCPU, hamsterMemory, defaultHamsterReplicas)

	// TODO(jkyros): we can't in-place scale without limits right now because of
	// https://github.com/kubernetes/kubernetes/blob/f4e246bc93ffb68b33ed67c7896c379efa4207e7/pkg/kubelet/kuberuntime/kuberuntime_manager.go#L550,
	// so if we want this to work, we need to add limits for now until we adjust that (assuming we can)
	SetupHamsterDeploymentWithLimits(f, hamsterCPU, hamsterMemory, hamsterCPULimit, hamsterMemoryLimit, defaultHamsterReplicas)
	podList, err := GetHamsterPods(f)
	gomega.Expect(err).NotTo(gomega.HaveOccurred())

	ginkgo.By("Setting up a VPA CRD")
	containerName := GetHamsterContainerNameByIndex(0)
	vpaCRD := test.VerticalPodAutoscaler().
		WithName("hamster-vpa").
		WithNamespace(f.Namespace.Name).
		WithTargetRef(controller).
		WithUpdateMode(vpa_types.UpdateModeInPlaceOrRecreate).
		WithEvictionRequirements(er).
		WithContainer(containerName).
		AppendRecommendation(
			test.Recommendation().
				WithContainer(containerName).
				WithTarget(containerName, "200m").
				WithLowerBound(containerName, "200m").
				WithUpperBound(containerName, "200m").
				GetContainerResources()).
		Get()

	InstallVPA(f, vpaCRD)

	return podList
}

// InPlacePodVerticalScalingInUse returns true if the pod is non-nil and any of its containers has a ResizePolicy set.
func InPlacePodVerticalScalingInUse(pod *apiv1.Pod) bool {
	if pod == nil {
		return false
	}
	for _, container := range pod.Spec.Containers {
		if len(container.ResizePolicy) > 0 {
			return true
		}
	}
	return false
}
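
The TODO list above mentions follow-up cases that this commit does not cover (fallback to eviction when the in-place resize is Deferred, Infeasible, or stuck InProgress). As a rough, hypothetical sketch only, not part of this diff, such a case could be another ginkgo.It block inside the same InPlaceE2eDescribe, reusing the helpers introduced here (setupPodsForUpscalingInPlace, WaitForPodsEvicted); how the resize is actually forced to be Infeasible is an open assumption.

// Hypothetical follow-up, not in this commit: fall back to eviction when the
// in-place resize is Infeasible. Helper names come from this diff; the step
// that forces the resize to be Infeasible is an assumption and still needs a
// real mechanism (e.g. recommending more than the node can allocate).
ginkgo.It("falls back to eviction when the in-place resize is Infeasible", func() {
	if InPlacePodVerticalScalingNotInUse {
		ginkgo.Skip("InPlacePodVerticalScaling was not in use (containers had no ResizePolicy)")
	}
	podList := setupPodsForUpscalingInPlace(f)
	// (assumed setup) force the resize to be Infeasible for these pods

	ginkgo.By("Waiting for pods to be evicted as a fallback")
	err := WaitForPodsEvicted(f, podList)
	gomega.Expect(err).NotTo(gomega.HaveOccurred())
})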
