
Commit 7978c9e

Merge pull request #9118 from shaikenov/shaikenov-test-failedscaleup-metric
Cover the RegisterFailedScaleUp metric generation with unit tests
2 parents 5f6697d + c8f8089 commit 7978c9e
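
Summary of the change: the failed-scale-up metric is now emitted through an injectable `metricObserver` interface on `ClusterStateRegistry` instead of a package-level function call, so the new unit tests can substitute a mock and assert that `RegisterFailedScaleUp` fires with the expected reason.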

File tree: 2 files changed (+77, −6 lines)


cluster-autoscaler/clusterstate/clusterstate.go

Lines changed: 11 additions & 1 deletion

@@ -118,6 +118,10 @@ type ScaleUpFailure struct {
 	Time time.Time
 }
 
+type metricObserver interface {
+	RegisterFailedScaleUp(reason metrics.FailedScaleUpReason, gpuResourceName, gpuType string)
+}
+
 // ClusterStateRegistry is a structure to keep track the current state of the cluster.
 type ClusterStateRegistry struct {
 	sync.Mutex
@@ -144,6 +148,7 @@ type ClusterStateRegistry struct {
 	interrupt                  chan struct{}
 	nodeGroupConfigProcessor   nodegroupconfig.NodeGroupConfigProcessor
 	asyncNodeGroupStateChecker asyncnodegroups.AsyncNodeGroupStateChecker
+	metrics                    metricObserver
 
 	// scaleUpFailures contains information about scale-up failures for each node group. It should be
 	// cleared periodically to avoid unnecessary accumulation.
@@ -159,6 +164,10 @@ type NodeGroupScalingSafety struct {
 
 // NewClusterStateRegistry creates new ClusterStateRegistry.
 func NewClusterStateRegistry(cloudProvider cloudprovider.CloudProvider, config ClusterStateRegistryConfig, logRecorder *utils.LogEventRecorder, backoff backoff.Backoff, nodeGroupConfigProcessor nodegroupconfig.NodeGroupConfigProcessor, asyncNodeGroupStateChecker asyncnodegroups.AsyncNodeGroupStateChecker) *ClusterStateRegistry {
+	return newClusterStateRegistry(cloudProvider, config, logRecorder, backoff, nodeGroupConfigProcessor, asyncNodeGroupStateChecker, metrics.DefaultMetrics)
+}
+
+func newClusterStateRegistry(cloudProvider cloudprovider.CloudProvider, config ClusterStateRegistryConfig, logRecorder *utils.LogEventRecorder, backoff backoff.Backoff, nodeGroupConfigProcessor nodegroupconfig.NodeGroupConfigProcessor, asyncNodeGroupStateChecker asyncnodegroups.AsyncNodeGroupStateChecker, metrics metricObserver) *ClusterStateRegistry {
 	return &ClusterStateRegistry{
 		scaleUpRequests:   make(map[string]*ScaleUpRequest),
 		scaleDownRequests: make([]*ScaleDownRequest, 0),
@@ -179,6 +188,7 @@ func NewClusterStateRegistry(cloudProvider cloudprovider.CloudProvider, config C
 		scaleUpFailures:            make(map[string][]ScaleUpFailure),
 		nodeGroupConfigProcessor:   nodeGroupConfigProcessor,
 		asyncNodeGroupStateChecker: asyncNodeGroupStateChecker,
+		metrics:                    metrics,
 	}
 }
 
@@ -347,7 +357,7 @@ func (csr *ClusterStateRegistry) RegisterFailedScaleDown(_ cloudprovider.NodeGro
 
 func (csr *ClusterStateRegistry) registerFailedScaleUpNoLock(nodeGroup cloudprovider.NodeGroup, reason metrics.FailedScaleUpReason, errorInfo cloudprovider.InstanceErrorInfo, gpuResourceName, gpuType string, currentTime time.Time) {
 	csr.scaleUpFailures[nodeGroup.Id()] = append(csr.scaleUpFailures[nodeGroup.Id()], ScaleUpFailure{NodeGroup: nodeGroup, Reason: reason, Time: currentTime})
-	metrics.RegisterFailedScaleUp(reason, gpuResourceName, gpuType)
+	csr.metrics.RegisterFailedScaleUp(reason, gpuResourceName, gpuType)
 	csr.backoffNodeGroup(nodeGroup, errorInfo, currentTime)
 }
 
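The substantive change in this file is a test seam: `registerFailedScaleUpNoLock` now emits the metric through an injected `metricObserver` field rather than the package-level `metrics.RegisterFailedScaleUp`, with the public constructor wiring in `metrics.DefaultMetrics`. A minimal, self-contained sketch of the same dependency-injection pattern, using hypothetical names rather than the autoscaler's actual types:

package main

import "fmt"

// failedScaleUpObserver mirrors the narrow metricObserver interface
// introduced above, renamed so this sketch stands alone.
type failedScaleUpObserver interface {
	RegisterFailedScaleUp(reason, gpuResourceName, gpuType string)
}

// stdoutObserver stands in for the production implementation
// (metrics.DefaultMetrics in the real code); here it only prints.
type stdoutObserver struct{}

func (stdoutObserver) RegisterFailedScaleUp(reason, gpuResourceName, gpuType string) {
	fmt.Printf("metric incremented: reason=%s gpu=%s/%s\n", reason, gpuResourceName, gpuType)
}

// registry keeps the observer as a field instead of calling a package-level
// metrics function; that field is the seam a test can replace with a mock.
type registry struct {
	metrics failedScaleUpObserver
}

func (r *registry) registerFailedScaleUp(reason string) {
	r.metrics.RegisterFailedScaleUp(reason, "", "")
}

func main() {
	r := &registry{metrics: stdoutObserver{}}
	r.registerFailedScaleUp("Timeout")
}

Because the interface is unexported and has a single method, production code pays no API cost for the seam while tests gain a direct assertion point.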
cluster-autoscaler/clusterstate/clusterstate_test.go

Lines changed: 66 additions & 5 deletions

@@ -32,13 +32,16 @@ import (
 	"k8s.io/autoscaler/cluster-autoscaler/metrics"
 	"k8s.io/autoscaler/cluster-autoscaler/processors/nodegroupconfig"
 	"k8s.io/autoscaler/cluster-autoscaler/processors/nodegroups/asyncnodegroups"
+	"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
 
 	"k8s.io/autoscaler/cluster-autoscaler/utils/taints"
 	. "k8s.io/autoscaler/cluster-autoscaler/utils/test"
 	"k8s.io/client-go/kubernetes/fake"
 	kube_record "k8s.io/client-go/tools/record"
 
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/mock"
+	mockprovider "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/mocks"
 	"k8s.io/autoscaler/cluster-autoscaler/utils/backoff"
 )
 
@@ -549,14 +552,17 @@ func TestExpiredScaleUp(t *testing.T) {
 	assert.NotNil(t, provider)
 
 	fakeClient := &fake.Clientset{}
+	mockMetrics := &mockMetrics{}
+	mockMetrics.On("RegisterFailedScaleUp", mock.Anything, mock.Anything, mock.Anything).Return()
 	fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap")
-	clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{
+	clusterstate := newClusterStateRegistry(provider, ClusterStateRegistryConfig{
 		MaxTotalUnreadyPercentage: 10,
 		OkTotalUnreadyCount:       1,
-	}, fakeLogRecorder, newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 2 * time.Minute}), asyncnodegroups.NewDefaultAsyncNodeGroupStateChecker())
+	}, fakeLogRecorder, newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 2 * time.Minute}), asyncnodegroups.NewDefaultAsyncNodeGroupStateChecker(), mockMetrics)
 	clusterstate.RegisterScaleUp(provider.GetNodeGroup("ng1"), 4, now.Add(-3*time.Minute))
 	err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now)
 	assert.NoError(t, err)
+	mockMetrics.AssertCalled(t, "RegisterFailedScaleUp", metrics.Timeout, "", "")
 	assert.True(t, clusterstate.IsClusterHealthy())
 	assert.False(t, clusterstate.IsNodeGroupHealthy("ng1"))
 	assert.Equal(t, clusterstate.GetScaleUpFailures(), map[string][]ScaleUpFailure{
@@ -915,18 +921,21 @@ func TestScaleUpBackoff(t *testing.T) {
 	assert.NotNil(t, provider)
 
 	fakeClient := &fake.Clientset{}
+	mockMetrics := &mockMetrics{}
+	mockMetrics.On("RegisterFailedScaleUp", mock.Anything, mock.Anything, mock.Anything).Return()
 	fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap")
-	clusterstate := NewClusterStateRegistry(
+	clusterstate := newClusterStateRegistry(
 		provider, ClusterStateRegistryConfig{
 			MaxTotalUnreadyPercentage: 10,
 			OkTotalUnreadyCount:       1,
 		}, fakeLogRecorder, newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 120 * time.Second}),
-		asyncnodegroups.NewDefaultAsyncNodeGroupStateChecker())
+		asyncnodegroups.NewDefaultAsyncNodeGroupStateChecker(), mockMetrics)
 
 	// After failed scale-up, node group should be still healthy, but should backoff from scale-ups
 	clusterstate.RegisterScaleUp(provider.GetNodeGroup("ng1"), 1, now.Add(-180*time.Second))
 	err := clusterstate.UpdateNodes([]*apiv1.Node{ng1_1, ng1_2, ng1_3}, nil, now)
 	assert.NoError(t, err)
+	mockMetrics.AssertCalled(t, "RegisterFailedScaleUp", metrics.Timeout, "", "")
 	assert.True(t, clusterstate.IsClusterHealthy())
 	assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
 	assert.Equal(t, NodeGroupScalingSafety{
@@ -1134,11 +1143,16 @@ func TestScaleUpFailures(t *testing.T) {
 
 	fakeClient := &fake.Clientset{}
 	fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap")
-	clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{}, fakeLogRecorder, newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}), asyncnodegroups.NewDefaultAsyncNodeGroupStateChecker())
+	mockMetrics := &mockMetrics{}
+	mockMetrics.On("RegisterFailedScaleUp", mock.Anything, mock.Anything, mock.Anything).Return()
+	clusterstate := newClusterStateRegistry(provider, ClusterStateRegistryConfig{}, fakeLogRecorder, newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}), asyncnodegroups.NewDefaultAsyncNodeGroupStateChecker(), mockMetrics)
 
 	clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng1"), string(metrics.Timeout), "", "", "", now)
+	mockMetrics.AssertCalled(t, "RegisterFailedScaleUp", metrics.Timeout, "", "")
 	clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng2"), string(metrics.Timeout), "", "", "", now)
+	mockMetrics.AssertCalled(t, "RegisterFailedScaleUp", metrics.Timeout, "", "")
 	clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng1"), string(metrics.APIError), "", "", "", now.Add(time.Minute))
+	mockMetrics.AssertCalled(t, "RegisterFailedScaleUp", metrics.APIError, "", "")
 
 	failures := clusterstate.GetScaleUpFailures()
 	assert.Equal(t, map[string][]ScaleUpFailure{
@@ -1631,3 +1645,50 @@ func TestUpcomingNodesFromUpcomingNodeGroups(t *testing.T) {
 	}
 
 }
+
+func TestHandleInstanceCreationErrors(t *testing.T) {
+	now := time.Now()
+
+	provider := testprovider.NewTestCloudProviderBuilder().Build()
+	mockedNodeGroup := &mockprovider.NodeGroup{}
+	mockedNodeGroup.On("Id").Return("ng1")
+	mockedNodeGroup.On("Nodes").Return([]cloudprovider.Instance{
+		{
+			Id: "instance1",
+			Status: &cloudprovider.InstanceStatus{
+				State: cloudprovider.InstanceCreating,
+				ErrorInfo: &cloudprovider.InstanceErrorInfo{
+					ErrorClass:   cloudprovider.OutOfResourcesErrorClass,
+					ErrorCode:    "RESOURCE_POOL_EXHAUSTED",
+					ErrorMessage: "",
+				},
+			},
+		},
+	}, nil)
+	mockedNodeGroup.On("Autoprovisioned").Return(false)
+	mockedNodeGroup.On("TargetSize").Return(1, nil)
+	node := BuildTestNode("ng1_1", 1000, 1000)
+	mockedNodeGroup.On("TemplateNodeInfo").Return(framework.NewTestNodeInfo(node), nil)
+	mockedNodeGroup.On("GetOptions", mock.Anything).Return(&config.NodeGroupAutoscalingOptions{}, nil)
+	provider.InsertNodeGroup(mockedNodeGroup)
+
+	fakeClient := &fake.Clientset{}
+	fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap")
+	mockMetrics := &mockMetrics{}
+	mockMetrics.On("RegisterFailedScaleUp", mock.Anything, mock.Anything, mock.Anything).Return()
+	clusterstate := newClusterStateRegistry(provider, ClusterStateRegistryConfig{}, fakeLogRecorder, newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}), asyncnodegroups.NewDefaultAsyncNodeGroupStateChecker(), mockMetrics)
+	clusterstate.RegisterScaleUp(mockedNodeGroup, 1, now)
+
+	// UpdateNodes will trigger handleInstanceCreationErrors
+	err := clusterstate.UpdateNodes([]*apiv1.Node{}, nil, now)
+	assert.NoError(t, err)
+	mockMetrics.AssertCalled(t, "RegisterFailedScaleUp", metrics.FailedScaleUpReason("RESOURCE_POOL_EXHAUSTED"), "", "")
+}
+
+type mockMetrics struct {
+	mock.Mock
+}
+
+func (m *mockMetrics) RegisterFailedScaleUp(reason metrics.FailedScaleUpReason, gpuResourceName, gpuType string) {
+	m.Called(reason, gpuResourceName, gpuType)
+}
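
The new assertions lean on testify's mock package: `mockMetrics` embeds `mock.Mock`, `On(...)` registers the expected call, the production code path invokes the observer, and `AssertCalled(...)` verifies the recorded arguments afterwards. A compact, self-contained illustration of that expect/record/assert flow, with illustrative names that are not part of the PR:

package example

import (
	"testing"

	"github.com/stretchr/testify/mock"
)

// fakeObserver embeds mock.Mock; Called records each invocation's
// arguments so the test can assert on them afterwards.
type fakeObserver struct {
	mock.Mock
}

func (f *fakeObserver) RegisterFailedScaleUp(reason, gpuResourceName, gpuType string) {
	f.Called(reason, gpuResourceName, gpuType)
}

func TestObserverIsNotified(t *testing.T) {
	f := &fakeObserver{}
	// Accept any arguments; there are no return values to stub.
	f.On("RegisterFailedScaleUp", mock.Anything, mock.Anything, mock.Anything).Return()

	// In the real tests this call happens inside ClusterStateRegistry.
	f.RegisterFailedScaleUp("Timeout", "", "")

	f.AssertCalled(t, "RegisterFailedScaleUp", "Timeout", "", "")
}

Using AssertCalled rather than a strict expectation keeps the tests focused on "the metric was emitted with this reason" without over-constraining how many times other metrics fire.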
