@@ -32,13 +32,16 @@ import (
3232 "k8s.io/autoscaler/cluster-autoscaler/metrics"
3333 "k8s.io/autoscaler/cluster-autoscaler/processors/nodegroupconfig"
3434 "k8s.io/autoscaler/cluster-autoscaler/processors/nodegroups/asyncnodegroups"
35+ "k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
3536
3637 "k8s.io/autoscaler/cluster-autoscaler/utils/taints"
3738 . "k8s.io/autoscaler/cluster-autoscaler/utils/test"
3839 "k8s.io/client-go/kubernetes/fake"
3940 kube_record "k8s.io/client-go/tools/record"
4041
4142 "github.com/stretchr/testify/assert"
43+ "github.com/stretchr/testify/mock"
44+ mockprovider "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/mocks"
4245 "k8s.io/autoscaler/cluster-autoscaler/utils/backoff"
4346)
4447
@@ -549,14 +552,17 @@ func TestExpiredScaleUp(t *testing.T) {
549552 assert .NotNil (t , provider )
550553
551554 fakeClient := & fake.Clientset {}
555+ mockMetrics := & mockMetrics {}
556+ mockMetrics .On ("RegisterFailedScaleUp" , mock .Anything , mock .Anything , mock .Anything ).Return ()
552557 fakeLogRecorder , _ := utils .NewStatusMapRecorder (fakeClient , "kube-system" , kube_record .NewFakeRecorder (5 ), false , "my-cool-configmap" )
553- clusterstate := NewClusterStateRegistry (provider , ClusterStateRegistryConfig {
558+ clusterstate := newClusterStateRegistry (provider , ClusterStateRegistryConfig {
554559 MaxTotalUnreadyPercentage : 10 ,
555560 OkTotalUnreadyCount : 1 ,
556- }, fakeLogRecorder , newBackoff (), nodegroupconfig .NewDefaultNodeGroupConfigProcessor (config.NodeGroupAutoscalingOptions {MaxNodeProvisionTime : 2 * time .Minute }), asyncnodegroups .NewDefaultAsyncNodeGroupStateChecker ())
561+ }, fakeLogRecorder , newBackoff (), nodegroupconfig .NewDefaultNodeGroupConfigProcessor (config.NodeGroupAutoscalingOptions {MaxNodeProvisionTime : 2 * time .Minute }), asyncnodegroups .NewDefaultAsyncNodeGroupStateChecker (), mockMetrics )
557562 clusterstate .RegisterScaleUp (provider .GetNodeGroup ("ng1" ), 4 , now .Add (- 3 * time .Minute ))
558563 err := clusterstate .UpdateNodes ([]* apiv1.Node {ng1_1 }, nil , now )
559564 assert .NoError (t , err )
565+ mockMetrics .AssertCalled (t , "RegisterFailedScaleUp" , metrics .Timeout , "" , "" )
560566 assert .True (t , clusterstate .IsClusterHealthy ())
561567 assert .False (t , clusterstate .IsNodeGroupHealthy ("ng1" ))
562568 assert .Equal (t , clusterstate .GetScaleUpFailures (), map [string ][]ScaleUpFailure {
@@ -915,18 +921,21 @@ func TestScaleUpBackoff(t *testing.T) {
915921 assert .NotNil (t , provider )
916922
917923 fakeClient := & fake.Clientset {}
924+ mockMetrics := & mockMetrics {}
925+ mockMetrics .On ("RegisterFailedScaleUp" , mock .Anything , mock .Anything , mock .Anything ).Return ()
918926 fakeLogRecorder , _ := utils .NewStatusMapRecorder (fakeClient , "kube-system" , kube_record .NewFakeRecorder (5 ), false , "my-cool-configmap" )
919- clusterstate := NewClusterStateRegistry (
927+ clusterstate := newClusterStateRegistry (
920928 provider , ClusterStateRegistryConfig {
921929 MaxTotalUnreadyPercentage : 10 ,
922930 OkTotalUnreadyCount : 1 ,
923931 }, fakeLogRecorder , newBackoff (), nodegroupconfig .NewDefaultNodeGroupConfigProcessor (config.NodeGroupAutoscalingOptions {MaxNodeProvisionTime : 120 * time .Second }),
924- asyncnodegroups .NewDefaultAsyncNodeGroupStateChecker ())
932+ asyncnodegroups .NewDefaultAsyncNodeGroupStateChecker (), mockMetrics )
925933
926934 // After failed scale-up, node group should be still healthy, but should backoff from scale-ups
927935 clusterstate .RegisterScaleUp (provider .GetNodeGroup ("ng1" ), 1 , now .Add (- 180 * time .Second ))
928936 err := clusterstate .UpdateNodes ([]* apiv1.Node {ng1_1 , ng1_2 , ng1_3 }, nil , now )
929937 assert .NoError (t , err )
938+ mockMetrics .AssertCalled (t , "RegisterFailedScaleUp" , metrics .Timeout , "" , "" )
930939 assert .True (t , clusterstate .IsClusterHealthy ())
931940 assert .True (t , clusterstate .IsNodeGroupHealthy ("ng1" ))
932941 assert .Equal (t , NodeGroupScalingSafety {
@@ -1134,11 +1143,16 @@ func TestScaleUpFailures(t *testing.T) {
11341143
11351144 fakeClient := & fake.Clientset {}
11361145 fakeLogRecorder , _ := utils .NewStatusMapRecorder (fakeClient , "kube-system" , kube_record .NewFakeRecorder (5 ), false , "my-cool-configmap" )
1137- clusterstate := NewClusterStateRegistry (provider , ClusterStateRegistryConfig {}, fakeLogRecorder , newBackoff (), nodegroupconfig .NewDefaultNodeGroupConfigProcessor (config.NodeGroupAutoscalingOptions {MaxNodeProvisionTime : 15 * time .Minute }), asyncnodegroups .NewDefaultAsyncNodeGroupStateChecker ())
1146+ mockMetrics := & mockMetrics {}
1147+ mockMetrics .On ("RegisterFailedScaleUp" , mock .Anything , mock .Anything , mock .Anything ).Return ()
1148+ clusterstate := newClusterStateRegistry (provider , ClusterStateRegistryConfig {}, fakeLogRecorder , newBackoff (), nodegroupconfig .NewDefaultNodeGroupConfigProcessor (config.NodeGroupAutoscalingOptions {MaxNodeProvisionTime : 15 * time .Minute }), asyncnodegroups .NewDefaultAsyncNodeGroupStateChecker (), mockMetrics )
11381149
11391150 clusterstate .RegisterFailedScaleUp (provider .GetNodeGroup ("ng1" ), string (metrics .Timeout ), "" , "" , "" , now )
1151+ mockMetrics .AssertCalled (t , "RegisterFailedScaleUp" , metrics .Timeout , "" , "" )
11401152 clusterstate .RegisterFailedScaleUp (provider .GetNodeGroup ("ng2" ), string (metrics .Timeout ), "" , "" , "" , now )
1153+ mockMetrics .AssertCalled (t , "RegisterFailedScaleUp" , metrics .Timeout , "" , "" )
11411154 clusterstate .RegisterFailedScaleUp (provider .GetNodeGroup ("ng1" ), string (metrics .APIError ), "" , "" , "" , now .Add (time .Minute ))
1155+ mockMetrics .AssertCalled (t , "RegisterFailedScaleUp" , metrics .APIError , "" , "" )
11421156
11431157 failures := clusterstate .GetScaleUpFailures ()
11441158 assert .Equal (t , map [string ][]ScaleUpFailure {
@@ -1631,3 +1645,50 @@ func TestUpcomingNodesFromUpcomingNodeGroups(t *testing.T) {
16311645 }
16321646
16331647}
1648+
1649+ func TestHandleInstanceCreationErrors (t * testing.T ) {
1650+ now := time .Now ()
1651+
1652+ provider := testprovider .NewTestCloudProviderBuilder ().Build ()
1653+ mockedNodeGroup := & mockprovider.NodeGroup {}
1654+ mockedNodeGroup .On ("Id" ).Return ("ng1" )
1655+ mockedNodeGroup .On ("Nodes" ).Return ([]cloudprovider.Instance {
1656+ {
1657+ Id : "instance1" ,
1658+ Status : & cloudprovider.InstanceStatus {
1659+ State : cloudprovider .InstanceCreating ,
1660+ ErrorInfo : & cloudprovider.InstanceErrorInfo {
1661+ ErrorClass : cloudprovider .OutOfResourcesErrorClass ,
1662+ ErrorCode : "RESOURCE_POOL_EXHAUSTED" ,
1663+ ErrorMessage : "" ,
1664+ },
1665+ },
1666+ },
1667+ }, nil )
1668+ mockedNodeGroup .On ("Autoprovisioned" ).Return (false )
1669+ mockedNodeGroup .On ("TargetSize" ).Return (1 , nil )
1670+ node := BuildTestNode ("ng1_1" , 1000 , 1000 )
1671+ mockedNodeGroup .On ("TemplateNodeInfo" ).Return (framework .NewTestNodeInfo (node ), nil )
1672+ mockedNodeGroup .On ("GetOptions" , mock .Anything ).Return (& config.NodeGroupAutoscalingOptions {}, nil )
1673+ provider .InsertNodeGroup (mockedNodeGroup )
1674+
1675+ fakeClient := & fake.Clientset {}
1676+ fakeLogRecorder , _ := utils .NewStatusMapRecorder (fakeClient , "kube-system" , kube_record .NewFakeRecorder (5 ), false , "my-cool-configmap" )
1677+ mockMetrics := & mockMetrics {}
1678+ mockMetrics .On ("RegisterFailedScaleUp" , mock .Anything , mock .Anything , mock .Anything ).Return ()
1679+ clusterstate := newClusterStateRegistry (provider , ClusterStateRegistryConfig {}, fakeLogRecorder , newBackoff (), nodegroupconfig .NewDefaultNodeGroupConfigProcessor (config.NodeGroupAutoscalingOptions {MaxNodeProvisionTime : 15 * time .Minute }), asyncnodegroups .NewDefaultAsyncNodeGroupStateChecker (), mockMetrics )
1680+ clusterstate .RegisterScaleUp (mockedNodeGroup , 1 , now )
1681+
1682+ // UpdateNodes will trigger handleInstanceCreationErrors
1683+ err := clusterstate .UpdateNodes ([]* apiv1.Node {}, nil , now )
1684+ assert .NoError (t , err )
1685+ mockMetrics .AssertCalled (t , "RegisterFailedScaleUp" , metrics .FailedScaleUpReason ("RESOURCE_POOL_EXHAUSTED" ), "" , "" )
1686+ }
1687+
1688+ type mockMetrics struct {
1689+ mock.Mock
1690+ }
1691+
1692+ func (m * mockMetrics ) RegisterFailedScaleUp (reason metrics.FailedScaleUpReason , gpuResourceName , gpuType string ) {
1693+ m .Called (reason , gpuResourceName , gpuType )
1694+ }
0 commit comments