diff --git a/e2e/suites/gke.go b/e2e/suites/gke.go index 76c4afac..addc6970 100644 --- a/e2e/suites/gke.go +++ b/e2e/suites/gke.go @@ -148,7 +148,7 @@ func (ts *gkeTestSuite) Run(ctx context.Context, t *testing.T) { } if deployment.Status.UnavailableReplicas != 1 { - return fmt.Errorf("nginx replica running") + return fmt.Errorf("%v replica running: %v", deployment.Name, deployment.Status.UnavailableReplicas) } return nil diff --git a/go.mod b/go.mod index 9394bb71..2f590a6f 100644 --- a/go.mod +++ b/go.mod @@ -22,12 +22,12 @@ require ( golang.org/x/net v0.41.0 golang.org/x/sync v0.16.0 helm.sh/helm/v3 v3.17.3 - k8s.io/api v0.32.2 + k8s.io/api v0.33.2 k8s.io/apiextensions-apiserver v0.32.2 - k8s.io/apimachinery v0.32.2 + k8s.io/apimachinery v0.33.2 k8s.io/apiserver v0.32.2 k8s.io/cli-runtime v0.32.2 - k8s.io/client-go v0.32.2 + k8s.io/client-go v0.33.2 k8s.io/component-base v0.32.2 k8s.io/klog/v2 v2.130.1 k8s.io/kubectl v0.32.2 @@ -84,14 +84,12 @@ require ( github.com/gobwas/glob v0.2.3 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect - github.com/golang/protobuf v1.5.4 // indirect - github.com/google/btree v1.0.1 // indirect - github.com/google/gnostic-models v0.6.8 // indirect + github.com/google/btree v1.1.3 // indirect + github.com/google/gnostic-models v0.6.9 // indirect github.com/google/go-cmp v0.7.0 // indirect - github.com/google/gofuzz v1.2.0 // indirect github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect github.com/gorilla/mux v1.8.1 // indirect - github.com/gorilla/websocket v1.5.3 // indirect + github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect github.com/gosuri/uitable v0.0.4 // indirect github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 // indirect github.com/hashicorp/errwrap v1.1.0 // indirect @@ -169,11 +167,12 @@ require ( gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/ini.v1 v1.67.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f // indirect + k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 // indirect oras.land/oras-go v1.2.5 // indirect sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect sigs.k8s.io/kustomize/api v0.18.0 // indirect sigs.k8s.io/kustomize/kyaml v0.18.1 // indirect - sigs.k8s.io/structured-merge-diff/v4 v4.4.2 // indirect + sigs.k8s.io/randfill v1.0.0 // indirect + sigs.k8s.io/structured-merge-diff/v4 v4.6.0 // indirect ) diff --git a/go.sum b/go.sum index 548a490a..fc7eaaa8 100644 --- a/go.sum +++ b/go.sum @@ -192,18 +192,16 @@ github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6 github.com/golangci/lint-1 v0.0.0-20181222135242-d2cdd8c08219/go.mod h1:/X8TswGSh1pIozq4ZwCfxS0WA5JGXguxk94ar/4c87Y= github.com/gomodule/redigo v1.8.2 h1:H5XSIre1MB5NbPYFp+i1NBbb5qN1W8Y8YAQoAYbkm8k= github.com/gomodule/redigo v1.8.2/go.mod h1:P9dn9mFrCBvWhGE1wpxx6fgq7BAeLBk+UUUzlpkBYO0= -github.com/google/btree v1.0.1 h1:gK4Kx5IaGY9CD5sPJ36FHiBJ6ZXl0kilRiiCj+jdYp4= -github.com/google/btree v1.0.1/go.mod h1:xXMiIv4Fb/0kKde4SpL7qlzvu5cMJDRkFDxJfI9uaxA= -github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= -github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= +github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= +github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= +github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw= +github.com/google/gnostic-models v0.6.9/go.mod h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= -github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db h1:097atOisP2aRj7vFgYQBbFN4U4JNXUNYpxael3UzMyo= github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 h1:El6M4kTTCOh6aBiKaUGG7oYTSPP8MxqL4YI3kZKwcP4= @@ -216,8 +214,8 @@ github.com/gorilla/handlers v1.5.1/go.mod h1:t8XrUpc4KVXb7HGyJ4/cEnwQiaxrX/hz1Zv github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= -github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg= -github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 h1:JeSE6pjso5THxAzdVpqr6/geYxZytqFMBCOtn/ujyeo= +github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674/go.mod h1:r4w70xmWCQKmi1ONH4KIaBptdivuRPyosB9RmPlGEwA= github.com/gosuri/uitable v0.0.4 h1:IG2xLKRvErL3uhY6e1BylFzG+aJiwQviDDTfOKeKTpY= github.com/gosuri/uitable v0.0.4/go.mod h1:tKR86bXuXPZazfOTG1FIzvjIdXzd0mo4Vtn16vt0PJo= github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 h1:+ngKgrYPPJrOjhax5N+uePQ0Fh1Z7PheYoUI/0nzkPA= @@ -616,24 +614,24 @@ gotest.tools/v3 v3.4.0 h1:ZazjZUfuVeZGLAmlKKuyv3IKP5orXcwtOwDQH6YVr6o= gotest.tools/v3 v3.4.0/go.mod h1:CtbdzLSsqVhDgMtKsx03ird5YTGB3ar27v0u/yKBW5g= helm.sh/helm/v3 v3.17.3 h1:3n5rW3D0ArjFl0p4/oWO8IbY/HKaNNwJtOQFdH2AZHg= helm.sh/helm/v3 v3.17.3/go.mod h1:+uJKMH/UiMzZQOALR3XUf3BLIoczI2RKKD6bMhPh4G8= -k8s.io/api v0.32.2 h1:bZrMLEkgizC24G9eViHGOPbW+aRo9duEISRIJKfdJuw= -k8s.io/api v0.32.2/go.mod h1:hKlhk4x1sJyYnHENsrdCWw31FEmCijNGPJO5WzHiJ6Y= +k8s.io/api v0.33.2 h1:YgwIS5jKfA+BZg//OQhkJNIfie/kmRsO0BmNaVSimvY= +k8s.io/api v0.33.2/go.mod h1:fhrbphQJSM2cXzCWgqU29xLDuks4mu7ti9vveEnpSXs= k8s.io/apiextensions-apiserver v0.32.2 h1:2YMk285jWMk2188V2AERy5yDwBYrjgWYggscghPCvV4= k8s.io/apiextensions-apiserver v0.32.2/go.mod h1:GPwf8sph7YlJT3H6aKUWtd0E+oyShk/YHWQHf/OOgCA= -k8s.io/apimachinery v0.32.2 h1:yoQBR9ZGkA6Rgmhbp/yuT9/g+4lxtsGYwW6dR6BDPLQ= -k8s.io/apimachinery v0.32.2/go.mod h1:GpHVgxoKlTxClKcteaeuF1Ul/lDVb74KpZcxcmLDElE= +k8s.io/apimachinery v0.33.2 h1:IHFVhqg59mb8PJWTLi8m1mAoepkUNYmptHsV+Z1m5jY= +k8s.io/apimachinery v0.33.2/go.mod h1:BHW0YOu7n22fFv/JkYOEfkUYNRN0fj0BlvMFWA7b+SM= k8s.io/apiserver v0.32.2 h1:WzyxAu4mvLkQxwD9hGa4ZfExo3yZZaYzoYvvVDlM6vw= k8s.io/apiserver v0.32.2/go.mod h1:PEwREHiHNU2oFdte7BjzA1ZyjWjuckORLIK/wLV5goM= k8s.io/cli-runtime v0.32.2 h1:aKQR4foh9qeyckKRkNXUccP9moxzffyndZAvr+IXMks= k8s.io/cli-runtime v0.32.2/go.mod h1:a/JpeMztz3xDa7GCyyShcwe55p8pbcCVQxvqZnIwXN8= -k8s.io/client-go v0.32.2 h1:4dYCD4Nz+9RApM2b/3BtVvBHw54QjMFUl1OLcJG5yOA= -k8s.io/client-go v0.32.2/go.mod h1:fpZ4oJXclZ3r2nDOv+Ux3XcJutfrwjKTCHz2H3sww94= +k8s.io/client-go v0.33.2 h1:z8CIcc0P581x/J1ZYf4CNzRKxRvQAwoAolYPbtQes+E= +k8s.io/client-go v0.33.2/go.mod h1:9mCgT4wROvL948w6f6ArJNb7yQd7QsvqavDeZHvNmHo= k8s.io/component-base v0.32.2 h1:1aUL5Vdmu7qNo4ZsE+569PV5zFatM9hl+lb3dEea2zU= k8s.io/component-base v0.32.2/go.mod h1:PXJ61Vx9Lg+P5mS8TLd7bCIr+eMJRQTyXe8KvkrvJq0= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f h1:GA7//TjRY9yWGy1poLzYYJJ4JRdzg3+O6e8I+e+8T5Y= -k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f/go.mod h1:R/HEjbvWI0qdfb8viZUeVZm0X6IZnxAydC7YU42CMw4= +k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff h1:/usPimJzUKKu+m+TE36gUyGcf03XZEP0ZIKgKj35LS4= +k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff/go.mod h1:5jIi+8yX4RIb8wk3XwBo5Pq2ccx4FP10ohkbSKCZoK8= k8s.io/kubectl v0.32.2 h1:TAkag6+XfSBgkqK9I7ZvwtF0WVtUAvK8ZqTt+5zi1Us= k8s.io/kubectl v0.32.2/go.mod h1:+h/NQFSPxiDZYX/WZaWw9fwYezGLISP0ud8nQKg+3g8= k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 h1:M3sRQVHv7vB20Xc2ybTt7ODCeFj6JSWYFzOFnYeS6Ro= @@ -648,7 +646,10 @@ sigs.k8s.io/kustomize/api v0.18.0 h1:hTzp67k+3NEVInwz5BHyzc9rGxIauoXferXyjv5lWPo sigs.k8s.io/kustomize/api v0.18.0/go.mod h1:f8isXnX+8b+SGLHQ6yO4JG1rdkZlvhaCf/uZbLVMb0U= sigs.k8s.io/kustomize/kyaml v0.18.1 h1:WvBo56Wzw3fjS+7vBjN6TeivvpbW9GmRaWZ9CIVmt4E= sigs.k8s.io/kustomize/kyaml v0.18.1/go.mod h1:C3L2BFVU1jgcddNBE1TxuVLgS46TjObMwW5FT9FcjYo= -sigs.k8s.io/structured-merge-diff/v4 v4.4.2 h1:MdmvkGuXi/8io6ixD5wud3vOLwc1rj0aNqRlpuvjmwA= -sigs.k8s.io/structured-merge-diff/v4 v4.4.2/go.mod h1:N8f93tFZh9U6vpxwRArLiikrE5/2tiu1w1AGfACIGE4= +sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= +sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= +sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= +sigs.k8s.io/structured-merge-diff/v4 v4.6.0 h1:IUA9nvMmnKWcj5jl84xn+T5MnlZKThmUW1TdblaLVAc= +sigs.k8s.io/structured-merge-diff/v4 v4.6.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= diff --git a/internal/actions/check_node_deleted.go b/internal/actions/check_node_deleted.go index 81fd1f8b..2d32afde 100644 --- a/internal/actions/check_node_deleted.go +++ b/internal/actions/check_node_deleted.go @@ -8,8 +8,6 @@ import ( "time" "github.com/sirupsen/logrus" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "github.com/castai/cluster-controller/internal/castai" @@ -43,6 +41,9 @@ type CheckNodeDeletedHandler struct { var errNodeNotDeleted = errors.New("node is not deleted") func (h *CheckNodeDeletedHandler) Handle(ctx context.Context, action *castai.ClusterAction) error { + if action == nil { + return fmt.Errorf("action is nil %w", errAction) + } req, ok := action.Data().(*castai.ActionCheckNodeDeleted) if !ok { return newUnexpectedTypeErr(action.Data(), req) @@ -52,8 +53,17 @@ func (h *CheckNodeDeletedHandler) Handle(ctx context.Context, action *castai.Clu "node_name": req.NodeName, "node_id": req.NodeID, "type": reflect.TypeOf(action.Data().(*castai.ActionCheckNodeDeleted)).String(), + "provider_id": req.ProviderId, ActionIDLogField: action.ID, }) + + log.Info("checking if node is deleted") + if req.NodeName == "" || + (req.NodeID == "" && req.ProviderId == "") { + return fmt.Errorf("node name %v or node ID: %v or provider ID: %v is empty %w", + req.NodeName, req.NodeID, req.ProviderId, errAction) + } + log.Info("checking if node is deleted") boff := waitext.NewConstantBackoff(h.cfg.retryWait) @@ -63,34 +73,7 @@ func (h *CheckNodeDeletedHandler) Handle(ctx context.Context, action *castai.Clu boff, h.cfg.retries, func(ctx context.Context) (bool, error) { - n, err := h.clientset.CoreV1().Nodes().Get(ctx, req.NodeName, metav1.GetOptions{}) - if apierrors.IsNotFound(err) { - return false, nil - } - - if n == nil { - return false, nil - } - - currentNodeID, ok := n.Labels[castai.LabelNodeID] - if !ok { - log.Info("node doesn't have castai node id label") - } - if currentNodeID != "" { - if currentNodeID != req.NodeID { - log.Info("node name was reused. Original node is deleted") - return false, nil - } - if currentNodeID == req.NodeID { - return false, fmt.Errorf("current node id = request node ID %w", errNodeNotDeleted) - } - } - - if n != nil { - return false, errNodeNotDeleted - } - - return true, err + return checkNodeDeleted(ctx, h.clientset.CoreV1().Nodes(), req.NodeName, req.NodeID, req.ProviderId, log) }, func(err error) { log.Warnf("node deletion check failed, will retry: %v", err) diff --git a/internal/actions/check_node_deleted_test.go b/internal/actions/check_node_deleted_test.go new file mode 100644 index 00000000..0a6880d7 --- /dev/null +++ b/internal/actions/check_node_deleted_test.go @@ -0,0 +1,222 @@ +package actions + +import ( + "context" + "testing" + + "github.com/google/uuid" + "github.com/sirupsen/logrus" + "github.com/stretchr/testify/require" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes/fake" + + "github.com/castai/cluster-controller/internal/castai" +) + +func TestCheckNodeDeletedHandler_Handle(t *testing.T) { + t.Parallel() + type fields struct { + cfg checkNodeDeletedConfig + } + type args struct { + action *castai.ClusterAction + tuneFakeObjects []runtime.Object + } + tests := []struct { + name string + fields fields + args args + wantErr error + }{ + { + name: "nil action", + wantErr: errAction, + }, + { + name: "wrong action type", + args: args{ + action: &castai.ClusterAction{}, + }, + wantErr: errAction, + }, + { + name: "empty node name", + args: args{ + action: newActionCheckNodeDeleted("", nodeID, providerID), + }, + wantErr: errAction, + }, + { + name: "nodeID is not matching", + args: args{ + action: newActionCheckNodeDeleted(nodeName, nodeID, providerID), + tuneFakeObjects: []runtime.Object{ + &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: "another-node-id", + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + }, + }, + }, + }, + { + name: "provider is not matching", + args: args{ + action: newActionCheckNodeDeleted(nodeName, nodeID, providerID), + tuneFakeObjects: []runtime.Object{ + &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: "another-provider-id", + }, + }, + }, + }, + }, + { + name: "provider id of Node is empty but nodeID matches", + args: args{ + action: newActionCheckNodeDeleted(nodeName, nodeID, providerID), + tuneFakeObjects: []runtime.Object{ + &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: "", + }, + }, + }, + }, + wantErr: errNodeNotDeleted, + }, + { + name: "provider id of request is empty but nodeID matches", + args: args{ + action: newActionCheckNodeDeleted(nodeName, nodeID, ""), + tuneFakeObjects: []runtime.Object{ + &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + }, + }, + }, + wantErr: errNodeNotDeleted, + }, + { + name: "node id at label is empty but provider ID matches", + args: args{ + action: newActionCheckNodeDeleted(nodeName, nodeID, providerID), + tuneFakeObjects: []runtime.Object{ + &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{}, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + }, + }, + }, + wantErr: errNodeNotDeleted, + }, + { + name: "node id at request is empty but provider ID matches", + args: args{ + action: newActionCheckNodeDeleted(nodeName, "", providerID), + tuneFakeObjects: []runtime.Object{ + &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + }, + }, + }, + wantErr: errNodeNotDeleted, + }, + { + name: "node found and matches IDs", + args: args{ + action: newActionCheckNodeDeleted(nodeName, nodeID, providerID), + tuneFakeObjects: []runtime.Object{ + &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + }, + }, + }, + wantErr: errNodeNotDeleted, + }, + { + name: "handle check successfully when node is not found", + args: args{ + action: newActionCheckNodeDeleted(nodeName, nodeID, providerID), + }, + }, + } + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + clientSet := fake.NewClientset(tt.args.tuneFakeObjects...) + h := &CheckNodeDeletedHandler{ + log: logrus.New(), + clientset: clientSet, + cfg: tt.fields.cfg, + } + err := h.Handle(context.Background(), tt.args.action) + require.Equal(t, tt.wantErr != nil, err != nil, "Handle() error = %v, wantErr %v", err, tt.wantErr) + if tt.wantErr != nil { + require.ErrorIs(t, err, tt.wantErr, "Handle() error mismatch") + } + }) + } +} + +func newActionCheckNodeDeleted(nodeName, nodeID, providerID string) *castai.ClusterAction { + return &castai.ClusterAction{ + ID: uuid.New().String(), + ActionCheckNodeDeleted: &castai.ActionCheckNodeDeleted{ + NodeName: nodeName, + ProviderId: providerID, + NodeID: nodeID, + }, + } +} diff --git a/internal/actions/check_node_handler_test.go b/internal/actions/check_node_handler_test.go deleted file mode 100644 index 00a99f3b..00000000 --- a/internal/actions/check_node_handler_test.go +++ /dev/null @@ -1,65 +0,0 @@ -package actions - -import ( - "context" - "testing" - - "github.com/google/uuid" - "github.com/sirupsen/logrus" - "github.com/stretchr/testify/require" - v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes/fake" - - "github.com/castai/cluster-controller/internal/castai" -) - -//nolint:goconst -func TestCheckNodeDeletedHandler(t *testing.T) { - r := require.New(t) - - log := logrus.New() - log.SetLevel(logrus.DebugLevel) - - t.Run("return error when node is not deleted", func(t *testing.T) { - nodeName := "node1" - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - }, - } - clientset := fake.NewSimpleClientset(node) - - h := CheckNodeDeletedHandler{ - log: log, - clientset: clientset, - cfg: checkNodeDeletedConfig{}, - } - - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionCheckNodeDeleted: &castai.ActionCheckNodeDeleted{NodeName: "node1"}, - } - - err := h.Handle(context.Background(), action) - r.EqualError(err, "node is not deleted") - }) - - t.Run("handle check successfully when node is not found", func(t *testing.T) { - clientset := fake.NewSimpleClientset() - - h := CheckNodeDeletedHandler{ - log: log, - clientset: clientset, - cfg: checkNodeDeletedConfig{}, - } - - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionCheckNodeDeleted: &castai.ActionCheckNodeDeleted{NodeName: "node1"}, - } - - err := h.Handle(context.Background(), action) - r.NoError(err) - }) -} diff --git a/internal/actions/check_node_status.go b/internal/actions/check_node_status.go index 044ac4f2..e423d3d2 100644 --- a/internal/actions/check_node_status.go +++ b/internal/actions/check_node_status.go @@ -2,15 +2,17 @@ package actions import ( "context" + "errors" "fmt" "reflect" "time" + "github.com/samber/lo" "github.com/sirupsen/logrus" corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes/typed/core/v1" "github.com/castai/cluster-controller/internal/castai" "github.com/castai/cluster-controller/internal/waitext" @@ -31,6 +33,9 @@ type CheckNodeStatusHandler struct { } func (h *CheckNodeStatusHandler) Handle(ctx context.Context, action *castai.ClusterAction) error { + if action == nil { + return fmt.Errorf("action is nil %w", errAction) + } req, ok := action.Data().(*castai.ActionCheckNodeStatus) if !ok { return newUnexpectedTypeErr(action.Data(), req) @@ -39,11 +44,18 @@ func (h *CheckNodeStatusHandler) Handle(ctx context.Context, action *castai.Clus log := h.log.WithFields(logrus.Fields{ "node_name": req.NodeName, "node_id": req.NodeID, + "provider_id": req.ProviderId, "node_status": req.NodeStatus, "type": reflect.TypeOf(action.Data().(*castai.ActionCheckNodeStatus)).String(), ActionIDLogField: action.ID, }) + log.Info("checking status of node") + if req.NodeName == "" || + (req.NodeID == "" && req.ProviderId == "") { + return fmt.Errorf("node name or node ID/provider ID is empty %w", errAction) + } + switch req.NodeStatus { case castai.ActionCheckNodeStatus_READY: log.Info("checking node ready") @@ -71,52 +83,51 @@ func (h *CheckNodeStatusHandler) checkNodeDeleted(ctx context.Context, log *logr b, waitext.Forever, func(ctx context.Context) (bool, error) { - n, err := h.clientset.CoreV1().Nodes().Get(ctx, req.NodeName, metav1.GetOptions{}) - if apierrors.IsNotFound(err) { - return false, nil - } + return checkNodeDeleted(ctx, h.clientset.CoreV1().Nodes(), req.NodeName, req.NodeID, req.ProviderId, log) + }, + func(err error) { + log.Warnf("check node %s status failed, will retry: %v", req.NodeName, err) + }, + ) +} - // If node is nil - deleted - // If label is present and doesn't match - node was reused - deleted - // If label is present and matches - node is not deleted - // If label is not present and node is not nil - node is not deleted (potentially corrupted state). +func checkNodeDeleted(ctx context.Context, clientSet v1.NodeInterface, nodeName, nodeID, providerID string, log logrus.FieldLogger) (bool, error) { + // If node is nil - deleted + // If providerID or label have mismatch, then it's reused and deleted + // If label is present and matches - node is not deleted + // All other use cases can be found in tests + n, err := getNodeByIDs(ctx, clientSet, nodeName, nodeID, providerID, log) + if errors.Is(err, errNodeDoesNotMatch) { + // it means that node with given name exists, but it does not match requested node ID or provider ID. + return false, nil + } - if n == nil { - return false, nil - } + if errors.Is(err, errNodeNotFound) { + return false, nil + } - currentNodeID, ok := n.Labels[castai.LabelNodeID] - if !ok { - log.Info("node doesn't have castai node id label") - } - if currentNodeID != "" { - if currentNodeID != req.NodeID { - log.Info("node name was reused. Original node is deleted") - return false, nil - } - if currentNodeID == req.NodeID { - return false, fmt.Errorf("current node id is equal to requested node id: %v %w", req.NodeID, errNodeNotDeleted) - } - } + if err != nil { + return true, err + } - if n != nil { - return false, errNodeNotDeleted - } + if n == nil { + return false, nil + } - return true, err - }, - func(err error) { - h.log.Warnf("check node %s status failed, will retry: %v", req.NodeName, err) - }, - ) + return false, errNodeNotDeleted } func (h *CheckNodeStatusHandler) checkNodeReady(ctx context.Context, _ *logrus.Entry, req *castai.ActionCheckNodeStatus) error { timeout := 9 * time.Minute - watchObject := metav1.SingleObject(metav1.ObjectMeta{Name: req.NodeName}) if req.WaitTimeoutSeconds != nil { timeout = time.Duration(*req.WaitTimeoutSeconds) * time.Second } + + watchObject := metav1.SingleObject(metav1.ObjectMeta{ + Name: req.NodeName, + }) + watchObject.TimeoutSeconds = lo.ToPtr(int64(timeout.Seconds())) + ctx, cancel := context.WithTimeout(ctx, timeout) defer cancel() @@ -124,27 +135,37 @@ func (h *CheckNodeStatusHandler) checkNodeReady(ctx context.Context, _ *logrus.E if err != nil { return fmt.Errorf("creating node watch: %w", err) } - defer watch.Stop() - for r := range watch.ResultChan() { - if node, ok := r.Object.(*corev1.Node); ok { - if isNodeReady(node, req.NodeID) { - return nil + + for { + select { + case <-ctx.Done(): + return fmt.Errorf("node %s request timeout: %v %w", req.NodeName, timeout, ctx.Err()) + case r, ok := <-watch.ResultChan(): + if !ok { + return fmt.Errorf("node %s request timeout: %v %w", req.NodeName, timeout, errNodeWatcherClosed) + } + if node, ok := r.Object.(*corev1.Node); ok { + if h.isNodeReady(node, req.NodeID, req.ProviderId) { + return nil + } } } } - - return fmt.Errorf("timeout waiting for node %s to become ready", req.NodeName) } -func isNodeReady(node *corev1.Node, castNodeID string) bool { +func (h *CheckNodeStatusHandler) isNodeReady(node *corev1.Node, castNodeID, providerID string) bool { // if node has castai node id label, check if it matches the one we are waiting for // if it doesn't match, we can skip this node. - if val, ok := node.Labels[castai.LabelNodeID]; ok { - if val != "" && val != castNodeID { - return false - } + if err := isNodeIDProviderIDValid(node, castNodeID, providerID, h.log); err != nil { + h.log.WithFields(logrus.Fields{ + "node": node.Name, + "node_id": castNodeID, + "provider_id": providerID, + }).Warnf("node does not match requested node ID or provider ID: %v", err) + return false } + for _, cond := range node.Status.Conditions { if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue && !containsUninitializedNodeTaint(node.Spec.Taints) { return true diff --git a/internal/actions/check_node_status_test.go b/internal/actions/check_node_status_test.go index 643ee66c..331a49e6 100644 --- a/internal/actions/check_node_status_test.go +++ b/internal/actions/check_node_status_test.go @@ -2,16 +2,16 @@ package actions import ( "context" - "errors" - "sync" "testing" - "time" "github.com/google/uuid" + "github.com/samber/lo" "github.com/sirupsen/logrus" "github.com/stretchr/testify/require" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/watch" "k8s.io/client-go/kubernetes/fake" k8stest "k8s.io/client-go/testing" @@ -19,369 +19,459 @@ import ( "github.com/castai/cluster-controller/internal/castai" ) -func TestCheckStatus_Deleted(t *testing.T) { - log := logrus.New() - log.SetLevel(logrus.DebugLevel) +const ( + nodeName = "node1" + nodeID = "node-id" + providerID = "aws:///us-east-1" + podName = "pod1" +) - t.Run("return error when node is not deleted", func(t *testing.T) { - r := require.New(t) - nodeName := "node1" - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - Labels: map[string]string{ - castai.LabelNodeID: "old-node-id", +func TestCheckNodeStatusHandler_Handle_Deleted(t *testing.T) { + t.Parallel() + type fields struct { + tuneFakeObjects []runtime.Object + } + type args struct { + action *castai.ClusterAction + } + + nodeObject := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + } + + tests := []struct { + name string + fields fields + args args + wantErr error + }{ + { + name: "action is nil", + wantErr: errAction, + }, + { + name: "return error when action data with wrong action type", + args: args{ + action: &castai.ClusterAction{ + ActionDrainNode: &castai.ActionDrainNode{}, }, }, - } - clientset := fake.NewClientset(node) - - h := CheckNodeStatusHandler{ - log: log, - clientset: clientset, - } - - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionCheckNodeStatus: &castai.ActionCheckNodeStatus{ - NodeName: "node1", - NodeStatus: castai.ActionCheckNodeStatus_DELETED, - NodeID: "old-node-id", - }, - } - - err := h.Handle(context.Background(), action) - r.True(errors.Is(err, errNodeNotDeleted)) - }) - - t.Run("return error when node is not deleted with no label (backwards compatibility)", func(t *testing.T) { - r := require.New(t) - nodeName := "node1" - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - }, - } - clientset := fake.NewClientset(node) - - h := CheckNodeStatusHandler{ - log: log, - clientset: clientset, - } - - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionCheckNodeStatus: &castai.ActionCheckNodeStatus{ - NodeName: "node1", - NodeStatus: castai.ActionCheckNodeStatus_DELETED, - NodeID: "old-node-id", - }, - } - - err := h.Handle(context.Background(), action) - r.EqualError(err, "node is not deleted") - }) - - t.Run("handle check successfully when node is not found", func(t *testing.T) { - r := require.New(t) - clientset := fake.NewClientset() - - h := CheckNodeStatusHandler{ - log: log, - clientset: clientset, - } - - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionCheckNodeStatus: &castai.ActionCheckNodeStatus{ - NodeName: "node1", - NodeStatus: castai.ActionCheckNodeStatus_DELETED, - NodeID: "old-node-id", - }, - } - - err := h.Handle(context.Background(), action) - r.NoError(err) - }) - - t.Run("handle check successfully when node name was reused but id mismatch", func(t *testing.T) { - r := require.New(t) - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - Labels: map[string]string{ - castai.LabelNodeID: "old-node-id", + wantErr: errAction, + }, + { + name: "provider is not matching", + args: args{ + action: newActionCheckNodeStatus(nodeName, nodeID, "another-provider-id", castai.ActionCheckNodeStatus_DELETED, nil), + }, + fields: fields{ + tuneFakeObjects: []runtime.Object{ + nodeObject, }, }, - } - clientset := fake.NewClientset(node) - - h := CheckNodeStatusHandler{ - log: log, - clientset: clientset, - } - - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionCheckNodeStatus: &castai.ActionCheckNodeStatus{ - NodeName: "node1", - NodeStatus: castai.ActionCheckNodeStatus_DELETED, - NodeID: "im-a-different-node", - }, - } - - err := h.Handle(context.Background(), action) - r.NoError(err) - }) + }, + { + name: "provider id of Node is empty but nodeID matches", + args: args{ + action: newActionCheckNodeStatus(nodeName, nodeID, providerID, castai.ActionCheckNodeStatus_DELETED, nil), + }, + fields: fields{ + tuneFakeObjects: []runtime.Object{ + &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: "", + }, + }, + }, + }, + wantErr: errNodeNotDeleted, + }, + { + name: "provider id of request is empty but nodeID matches", + args: args{ + action: newActionCheckNodeStatus(nodeName, nodeID, "", castai.ActionCheckNodeStatus_DELETED, nil), + }, + fields: fields{ + tuneFakeObjects: []runtime.Object{ + nodeObject, + }, + }, + wantErr: errNodeNotDeleted, + }, + { + name: "node id at label is empty but provider ID matches", + args: args{ + action: newActionCheckNodeStatus(nodeName, nodeID, providerID, castai.ActionCheckNodeStatus_DELETED, nil), + }, + fields: fields{ + tuneFakeObjects: []runtime.Object{ + &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{}, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + }, + }, + }, + wantErr: errNodeNotDeleted, + }, + { + name: "node id at request is empty but provider ID matches", + args: args{ + action: newActionCheckNodeStatus(nodeName, "", providerID, castai.ActionCheckNodeStatus_DELETED, nil), + }, + fields: fields{ + tuneFakeObjects: []runtime.Object{ + nodeObject, + }, + }, + wantErr: errNodeNotDeleted, + }, + { + name: "node with the same name exists but IDs does not match", + fields: fields{ + tuneFakeObjects: []runtime.Object{ + nodeObject, + }, + }, + args: args{ + action: newActionCheckNodeStatus(nodeName, "another-node-id", "another-provider-id", castai.ActionCheckNodeStatus_DELETED, nil), + }, + }, + { + name: "node with the same name exists but node id does not match (provider matches)", + fields: fields{ + tuneFakeObjects: []runtime.Object{ + nodeObject, + }, + }, + args: args{ + action: newActionCheckNodeStatus(nodeName, "another-node-id", providerID, castai.ActionCheckNodeStatus_DELETED, nil), + }, + }, + { + name: "handle check successfully when node is not found", + args: args{ + action: newActionCheckNodeStatus(nodeName, nodeID, providerID, castai.ActionCheckNodeStatus_DELETED, nil), + }, + }, + { + name: "return error when node is not deleted", + fields: fields{ + tuneFakeObjects: []runtime.Object{ + nodeObject, + }, + }, + args: args{ + action: newActionCheckNodeStatus(nodeName, nodeID, providerID, castai.ActionCheckNodeStatus_DELETED, nil), + }, + wantErr: errNodeNotDeleted, + }, + } + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + clientSet := fake.NewClientset(tt.fields.tuneFakeObjects...) + log := logrus.New() + log.SetLevel(logrus.DebugLevel) + h := NewCheckNodeStatusHandler( + log, clientSet) + err := h.Handle(context.Background(), tt.args.action) + require.ErrorIs(t, err, tt.wantErr, "unexpected error: %v", err) + }) + } } -func TestCheckStatus_Ready(t *testing.T) { - log := logrus.New() - log.SetLevel(logrus.DebugLevel) - - t.Run("return error when node is not found", func(t *testing.T) { - r := require.New(t) - clientset := fake.NewClientset() - - h := CheckNodeStatusHandler{ - log: log, - clientset: clientset, - } - - watcher := watch.NewFake() - - clientset.PrependWatchReactor("nodes", k8stest.DefaultWatchReactor(watcher, nil)) - go func() { - time.Sleep(time.Second) - watcher.Stop() - }() - - timeout := int32(1) - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionCheckNodeStatus: &castai.ActionCheckNodeStatus{ - NodeName: "node1", - NodeStatus: castai.ActionCheckNodeStatus_READY, - WaitTimeoutSeconds: &timeout, - }, - } - - err := h.Handle(context.Background(), action) - r.EqualError(err, "timeout waiting for node node1 to become ready") - }) - - t.Run("handle check successfully when node become ready", func(t *testing.T) { - r := require.New(t) - nodeName := "node1" - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ +func TestCheckNodeStatusHandler_Handle_Ready(t *testing.T) { + t.Parallel() + type tuneFakeObjects struct { + event watch.EventType + object runtime.Object + } + type fields struct { + tuneFakeObjects []tuneFakeObjects + } + type args struct { + action *castai.ClusterAction + } + + nodeUID := types.UID(uuid.New().String()) + var nodeObjectNotReady, nodeObjectReady, nodeObjectReadyTainted, node2ObjectReadyAnotherNodeID runtime.Object + nodeObjectNotReady = &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + UID: nodeUID, + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{}, + }, + } + nodeObjectReadyTainted = &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + UID: nodeUID, + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Spec: v1.NodeSpec{ + Taints: []v1.Taint{taintCloudProviderUninitialized}, + ProviderID: providerID, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + }, + }, + }, + } + + nodeObjectReady = &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + UID: nodeUID, + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + }, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + } + + node2ObjectReadyAnotherNodeID = &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + UID: types.UID(uuid.New().String()), + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: "another-node-id", + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + }, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + } + + tests := []struct { + name string + fields fields + args args + wantErr error + }{ + { + name: "action is nil", + wantErr: errAction, + }, + { + name: "empty node name", + args: args{ + action: newActionCheckNodeStatus("", nodeID, providerID, castai.ActionCheckNodeStatus_READY, lo.ToPtr(int32(1))), + }, + wantErr: errAction, + }, + { + name: "return error when ctx timeout", + args: args{ + action: newActionCheckNodeStatus(nodeName, nodeID, providerID, castai.ActionCheckNodeStatus_READY, lo.ToPtr(int32(1))), + }, + wantErr: context.DeadlineExceeded, + }, + { + name: "return error when ctx timeout: node not ready", + fields: fields{ + tuneFakeObjects: []tuneFakeObjects{ { - Type: v1.NodeReady, - Status: v1.ConditionFalse, + event: watch.Modified, + object: nodeObjectNotReady, }, }, }, - } - clientset := fake.NewClientset(node) - - h := CheckNodeStatusHandler{ - log: log, - clientset: clientset, - } - - timeout := int32(60) - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionCheckNodeStatus: &castai.ActionCheckNodeStatus{ - NodeName: "node1", - NodeStatus: castai.ActionCheckNodeStatus_READY, - WaitTimeoutSeconds: &timeout, - }, - } - - var wg sync.WaitGroup - wg.Add(2) - var err error - go func() { - err = h.Handle(context.Background(), action) - wg.Done() - }() - - go func() { - time.Sleep(1 * time.Second) - node.Status.Conditions[0].Status = v1.ConditionTrue - _, _ = clientset.CoreV1().Nodes().Update(context.Background(), node, metav1.UpdateOptions{}) - wg.Done() - }() - wg.Wait() - - r.NoError(err) - }) - - t.Run("handle check successfully when node become ready - removed taint", func(t *testing.T) { - r := require.New(t) - nodeName := "node1" - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ + args: args{ + action: newActionCheckNodeStatus(nodeName, nodeID, providerID, castai.ActionCheckNodeStatus_READY, lo.ToPtr(int32(2))), + }, + wantErr: context.DeadlineExceeded, + }, + { + name: "return error when ctx timeout: node is ready but has different match ID", + fields: fields{ + tuneFakeObjects: []tuneFakeObjects{ { - Type: v1.NodeReady, - Status: v1.ConditionTrue, + event: watch.Modified, + object: node2ObjectReadyAnotherNodeID, + }, + { + event: watch.Modified, + object: node2ObjectReadyAnotherNodeID, }, }, }, - Spec: v1.NodeSpec{ - Taints: []v1.Taint{taintCloudProviderUninitialized}, + args: args{ + action: newActionCheckNodeStatus(nodeName, nodeID, providerID, castai.ActionCheckNodeStatus_READY, lo.ToPtr(int32(2))), }, - } - clientset := fake.NewClientset(node) - - h := CheckNodeStatusHandler{ - log: log, - clientset: clientset, - } - - timeout := int32(60) - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionCheckNodeStatus: &castai.ActionCheckNodeStatus{ - NodeName: "node1", - NodeStatus: castai.ActionCheckNodeStatus_READY, - WaitTimeoutSeconds: &timeout, - }, - } - - var wg sync.WaitGroup - wg.Add(2) - var err error - go func() { - err = h.Handle(context.Background(), action) - wg.Done() - }() - - go func() { - time.Sleep(1 * time.Second) - node.Spec.Taints = []v1.Taint{} - _, _ = clientset.CoreV1().Nodes().Update(context.Background(), node, metav1.UpdateOptions{}) - wg.Done() - }() - wg.Wait() - - r.NoError(err) - }) - - t.Run("handle error when node is not ready", func(t *testing.T) { - r := require.New(t) - nodeName := "node1" - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{}, - }, - } - clientset := fake.NewClientset(node) - watcher := watch.NewFake() - - clientset.PrependWatchReactor("nodes", k8stest.DefaultWatchReactor(watcher, nil)) - go func() { - time.Sleep(time.Second) - watcher.Stop() - }() - - h := CheckNodeStatusHandler{ - log: log, - clientset: clientset, - } - - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionCheckNodeStatus: &castai.ActionCheckNodeStatus{ - NodeName: "node1", - NodeStatus: castai.ActionCheckNodeStatus_READY, + wantErr: context.DeadlineExceeded, + }, + { + name: "return error when ctx timeout: node is ready but tainted", + fields: fields{ + tuneFakeObjects: []tuneFakeObjects{ + { + event: watch.Modified, + object: nodeObjectReadyTainted, + }, + { + event: watch.Modified, + object: node2ObjectReadyAnotherNodeID, + }, + }, }, - } - - err := h.Handle(context.Background(), action) - r.Error(err) - r.EqualError(err, "timeout waiting for node node1 to become ready") - }) - - t.Run("handle check successfully when reusing node names happens and node is replaced", func(t *testing.T) { - r := require.New(t) - nodeName := "node1" - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - Labels: map[string]string{ - castai.LabelNodeID: "old-node-id", + args: args{ + action: newActionCheckNodeStatus(nodeName, nodeID, providerID, castai.ActionCheckNodeStatus_READY, lo.ToPtr(int32(2))), + }, + wantErr: context.DeadlineExceeded, + }, + { + name: "handle check successfully when node become ready", + fields: fields{ + tuneFakeObjects: []tuneFakeObjects{ + { + event: watch.Modified, + object: nodeObjectNotReady, + }, + { + event: watch.Modified, + object: nodeObjectReadyTainted, + }, + { + event: watch.Modified, + object: nodeObjectReady, + }, }, }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ + args: args{ + action: newActionCheckNodeStatus(nodeName, nodeID, providerID, castai.ActionCheckNodeStatus_READY, lo.ToPtr(int32(10))), + }, + }, + { + name: "handle check successfully when node become ready: request with empty provider ID", + fields: fields{ + tuneFakeObjects: []tuneFakeObjects{ + { + event: watch.Modified, + object: nodeObjectNotReady, + }, { - Type: v1.NodeReady, - Status: v1.ConditionFalse, + event: watch.Modified, + object: nodeObjectReadyTainted, + }, + { + event: watch.Modified, + object: nodeObjectReady, }, }, }, - } - clientset := fake.NewClientset(node) - - h := CheckNodeStatusHandler{ - log: log, - clientset: clientset, - } - - timeout := int32(60) - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionCheckNodeStatus: &castai.ActionCheckNodeStatus{ - NodeName: "node1", - NodeStatus: castai.ActionCheckNodeStatus_READY, - WaitTimeoutSeconds: &timeout, - NodeID: "new-node-id", - }, - } - - // simulate node replacement - // 1. node is deleted - // 2. new node is created with the same name and different id - // 3. node is ready - // 4. checkNodeStatusHandler.Handle() is called. - var wg sync.WaitGroup - wg.Add(2) - var err error - go func() { - err = h.Handle(context.Background(), action) - wg.Done() - }() - - go func() { - time.Sleep(1 * time.Second) - _ = clientset.CoreV1().Nodes().Delete(context.Background(), nodeName, metav1.DeleteOptions{}) - - time.Sleep(1 * time.Second) - newNode := node.DeepCopy() - newNode.Labels[castai.LabelNodeID] = "new-node-id" - - _, _ = clientset.CoreV1().Nodes().Create(context.Background(), newNode, metav1.CreateOptions{}) - - time.Sleep(5 * time.Second) - newNode.Status.Conditions[0].Status = v1.ConditionTrue - _, _ = clientset.CoreV1().Nodes().UpdateStatus(context.Background(), newNode, metav1.UpdateOptions{}) - wg.Done() - }() - wg.Wait() + args: args{ + action: newActionCheckNodeStatus(nodeName, nodeID, "", castai.ActionCheckNodeStatus_READY, lo.ToPtr(int32(10))), + }, + }, + { + name: "handle check successfully when node become ready - removed taint", + fields: fields{ + tuneFakeObjects: []tuneFakeObjects{ + { + event: watch.Modified, + object: nodeObjectReadyTainted, + }, + { + event: watch.Modified, + object: nodeObjectReady, + }, + }, + }, + args: args{ + action: newActionCheckNodeStatus(nodeName, nodeID, providerID, castai.ActionCheckNodeStatus_READY, lo.ToPtr(int32(1))), + }, + }, + } + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + clientSet := fake.NewClientset() + watcher := watch.NewFake() + defer watcher.Stop() + + go func() { + if len(tt.fields.tuneFakeObjects) == 0 { + return + } + watcher.Add(nodeObjectNotReady) + watcher.Add(node2ObjectReadyAnotherNodeID) + for _, obj := range tt.fields.tuneFakeObjects { + watcher.Action(obj.event, obj.object) + } + }() + clientSet.PrependWatchReactor("nodes", k8stest.DefaultWatchReactor(watcher, nil)) + + log := logrus.New() + log.SetLevel(logrus.DebugLevel) + h := NewCheckNodeStatusHandler(log, clientSet) + + err := h.Handle(context.Background(), tt.args.action) + require.ErrorIs(t, err, tt.wantErr, "unexpected error: %v", err) + }) + } +} - r.NoError(err) - }) +func newActionCheckNodeStatus(nodeName, nodeID, providerID string, status castai.ActionCheckNodeStatus_Status, timeout *int32) *castai.ClusterAction { + return &castai.ClusterAction{ + ID: uuid.New().String(), + ActionCheckNodeStatus: &castai.ActionCheckNodeStatus{ + NodeName: nodeName, + NodeID: nodeID, + ProviderId: providerID, + NodeStatus: status, + WaitTimeoutSeconds: timeout, + }, + } } diff --git a/internal/actions/delete_node_handler.go b/internal/actions/delete_node_handler.go index 6137a573..684260a6 100644 --- a/internal/actions/delete_node_handler.go +++ b/internal/actions/delete_node_handler.go @@ -27,8 +27,6 @@ type deleteNodeConfig struct { podsTerminationWait time.Duration } -var errNodeMismatch = errors.New("node id mismatch") - func NewDeleteNodeHandler(log logrus.FieldLogger, clientset kubernetes.Interface) *DeleteNodeHandler { return &DeleteNodeHandler{ log: log, @@ -53,6 +51,9 @@ type DeleteNodeHandler struct { } func (h *DeleteNodeHandler) Handle(ctx context.Context, action *castai.ClusterAction) error { + if action == nil { + return fmt.Errorf("action is nil %w", errAction) + } req, ok := action.Data().(*castai.ActionDeleteNode) if !ok { return newUnexpectedTypeErr(action.Data(), req) @@ -61,10 +62,16 @@ func (h *DeleteNodeHandler) Handle(ctx context.Context, action *castai.ClusterAc log := h.log.WithFields(logrus.Fields{ "node_name": req.NodeName, "node_id": req.NodeID, + "provider_id": req.ProviderId, "type": reflect.TypeOf(action.Data().(*castai.ActionDeleteNode)).String(), ActionIDLogField: action.ID, }) + log.Info("deleting kubernetes node") + if req.NodeName == "" || + (req.NodeID == "" && req.ProviderId == "") { + return fmt.Errorf("node name or node ID/provider ID is empty %w", errAction) + } b := waitext.NewConstantBackoff(h.cfg.deleteRetryWait) err := waitext.Retry( @@ -72,26 +79,22 @@ func (h *DeleteNodeHandler) Handle(ctx context.Context, action *castai.ClusterAc b, h.cfg.deleteRetries, func(ctx context.Context) (bool, error) { - current, err := h.clientset.CoreV1().Nodes().Get(ctx, req.NodeName, metav1.GetOptions{}) + current, err := getNodeByIDs(ctx, h.clientset.CoreV1().Nodes(), req.NodeName, req.NodeID, req.ProviderId, log) if err != nil { - if apierrors.IsNotFound(err) { - log.Info("node not found, skipping delete") - return false, nil + if errors.Is(err, errNodeNotFound) || errors.Is(err, errNodeDoesNotMatch) { + return false, err } - return true, fmt.Errorf("error getting node: %w", err) - } - if val, ok := current.Labels[castai.LabelNodeID]; ok { - if val != "" && val != req.NodeID { - log.Infof("node id mismatch, expected %q got %q. Skipping delete.", req.NodeID, val) - return true, errNodeMismatch - } + return true, err } - err = h.clientset.CoreV1().Nodes().Delete(ctx, current.Name, metav1.DeleteOptions{}) + err = h.clientset.CoreV1().Nodes().Delete(ctx, current.Name, metav1.DeleteOptions{ + Preconditions: &metav1.Preconditions{ + UID: ¤t.UID, + }, + }) if apierrors.IsNotFound(err) { - log.Info("node not found, skipping delete") - return false, nil + return false, errNodeNotFound } return true, err }, @@ -100,10 +103,13 @@ func (h *DeleteNodeHandler) Handle(ctx context.Context, action *castai.ClusterAc }, ) - if errors.Is(err, errNodeMismatch) { + if errors.Is(err, errNodeNotFound) || errors.Is(err, errNodeDoesNotMatch) { + log.Infof("node already deleted or does not match the requested node ID/provider ID %v", err) return nil } + if err != nil { + log.Errorf("error deleting kubernetes node: %v", err) return fmt.Errorf("error removing node %w", err) } diff --git a/internal/actions/delete_node_handler_test.go b/internal/actions/delete_node_handler_test.go index 4d3e9fe0..07611873 100644 --- a/internal/actions/delete_node_handler_test.go +++ b/internal/actions/delete_node_handler_test.go @@ -4,157 +4,181 @@ import ( "context" "testing" - "github.com/google/uuid" "github.com/sirupsen/logrus" "github.com/stretchr/testify/require" v1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/fields" + k8sfields "k8s.io/apimachinery/pkg/fields" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/fake" "github.com/castai/cluster-controller/internal/castai" ) -//nolint:goconst -func TestDeleteNodeHandler(t *testing.T) { - log := logrus.New() - log.SetLevel(logrus.DebugLevel) - - t.Run("delete successfully", func(t *testing.T) { - r := require.New(t) - nodeName := "node1" - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, +func TestDeleteNodeHandler_Handle(t *testing.T) { + t.Parallel() + node := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: nodeID, }, - } - clientset := fake.NewSimpleClientset(node) - - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionDeleteNode: &castai.ActionDeleteNode{ - NodeName: "node1", + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + } + type fields struct { + tuneFakeObjects []runtime.Object + cfg deleteNodeConfig + } + type args struct { + action *castai.ClusterAction + } + + tests := []struct { + name string + fields fields + args args + wantErr error + wantDeletedNode bool + }{ + { + name: "nil", + args: args{}, + wantErr: errAction, + }, + { + name: "wrong action type", + args: args{ + action: &castai.ClusterAction{ + ActionDrainNode: &castai.ActionDrainNode{}, + }, }, - } - - h := DeleteNodeHandler{ - log: log, - clientset: clientset, - cfg: deleteNodeConfig{}, - } - - err := h.Handle(context.Background(), action) - r.NoError(err) - - _, err = clientset.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{}) - r.Error(err) - r.True(apierrors.IsNotFound(err)) - }) - - t.Run("skip delete when node not found", func(t *testing.T) { - r := require.New(t) - nodeName := "node1" - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, + wantErr: errAction, + }, + { + name: "empty node name", + args: args{ + action: newActionDeleteNode("", nodeID, providerID), }, - } - clientset := fake.NewSimpleClientset(node) - - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionDeleteNode: &castai.ActionDeleteNode{ - NodeName: "already-deleted-node", + wantErr: errAction, + }, + { + name: "empty node ID and provider ID", + args: args{ + action: newActionDeleteNode(nodeName, "", ""), }, - } - - h := DeleteNodeHandler{ - log: log, - clientset: clientset, - cfg: deleteNodeConfig{}, - } - - err := h.Handle(context.Background(), action) - r.NoError(err) - - _, err = clientset.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{}) - r.NoError(err) - }) - - t.Run("skip delete when node id do not match", func(t *testing.T) { - r := require.New(t) - nodeName := "node1" - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - Labels: map[string]string{ - castai.LabelNodeID: "node-id", - }, + wantErr: errAction, + wantDeletedNode: false, + }, + { + name: "action with another node id and provider id - node not found", + args: args{ + action: newActionDeleteNode(nodeName, "another-node-id", "another-provider-id"), }, - } - clientset := fake.NewSimpleClientset(node) - - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionDeleteNode: &castai.ActionDeleteNode{ - NodeName: "node1", - NodeID: "another-node-id", + wantDeletedNode: false, + }, + { + name: "action with proper node id and another provider id - node not found", + args: args{ + action: newActionDeleteNode(nodeName, nodeID, "another-provider-id"), }, - } - - h := DeleteNodeHandler{ - log: log, - clientset: clientset, - cfg: deleteNodeConfig{}, - } - - err := h.Handle(context.Background(), action) - r.NoError(err) - - existing, err := clientset.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{}) - r.NoError(err) - existing.Labels[castai.LabelNodeID] = "node-id" - }) - - t.Run("delete node with pods", func(t *testing.T) { - r := require.New(t) - nodeName := "node1" - podName := "pod1" - clientset := setupFakeClientWithNodePodEviction(nodeName, podName) - - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionDeleteNode: &castai.ActionDeleteNode{ - NodeName: nodeName, + wantDeletedNode: false, + }, + { + name: "action with another node id and proper provider id - node not found", + args: args{ + action: newActionDeleteNode(nodeName, nodeID, "another-provider-id"), }, - } - - h := DeleteNodeHandler{ - log: log, - clientset: clientset, - cfg: deleteNodeConfig{ - podsTerminationWait: 1, + wantDeletedNode: false, + }, + { + name: "node not found", + args: args{ + action: newActionDeleteNode("node-not-found", nodeID, providerID), }, - DrainNodeHandler: DrainNodeHandler{clientset: clientset, log: log}, - } - - err := h.Handle(context.Background(), action) - r.NoError(err) - - _, err = clientset.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{}) - r.Error(err) - r.True(apierrors.IsNotFound(err)) - - pods, err := h.clientset.CoreV1().Pods(metav1.NamespaceAll).List(context.Background(), metav1.ListOptions{ - FieldSelector: fields.SelectorFromSet(fields.Set{"spec.nodeName": nodeName}).String(), - }) - r.NoError(err) - r.Len(pods.Items, 0) - va, err := h.clientset.StorageV1().VolumeAttachments().List(context.Background(), metav1.ListOptions{ - FieldSelector: fields.SelectorFromSet(fields.Set{}).String(), + wantDeletedNode: false, + }, + { + name: "delete successfully without pods", + args: args{ + action: newActionDeleteNode(nodeName, nodeID, providerID), + }, + fields: fields{ + tuneFakeObjects: []runtime.Object{ + node, + }, + }, + wantDeletedNode: true, + }, + { + name: "delete node with pods", + args: args{ + action: newActionDeleteNode(nodeName, nodeID, providerID), + }, + fields: fields{ + cfg: deleteNodeConfig{ + podsTerminationWait: 1, + }, + }, + wantDeletedNode: true, + }, + } + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + var clientSet kubernetes.Interface + if tt.fields.tuneFakeObjects == nil { + clientSet = setupFakeClientWithNodePodEviction(nodeName, nodeID, providerID, podName) + } else { + clientSet = fake.NewClientset(tt.fields.tuneFakeObjects...) + } + h := &DeleteNodeHandler{ + DrainNodeHandler: DrainNodeHandler{clientset: clientSet, log: logrus.New()}, + log: logrus.New(), + clientset: clientSet, + cfg: tt.fields.cfg, + } + err := h.Handle(context.Background(), tt.args.action) + require.Equal(t, tt.wantErr != nil, err != nil, "expected error: %v, got: %v", tt.wantErr, err) + if tt.wantErr != nil { + require.ErrorAs(t, err, &tt.wantErr, "Handle() error = %v, wantErr %v", err, tt.wantErr) + } + + if err != nil { + return + } + + _, err = h.clientset.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{}) + if tt.wantDeletedNode { + require.True(t, apierrors.IsNotFound(err), "node should be deleted, but got: %v", err) + pods, err := h.clientset.CoreV1().Pods(metav1.NamespaceAll).List(context.Background(), metav1.ListOptions{ + FieldSelector: k8sfields.SelectorFromSet(k8sfields.Set{"spec.nodeName": nodeName}).String(), + }) + require.NoError(t, err) + require.Len(t, pods.Items, 0) + va, err := h.clientset.StorageV1().VolumeAttachments().List(context.Background(), metav1.ListOptions{ + FieldSelector: k8sfields.SelectorFromSet(k8sfields.Set{}).String(), + }) + require.NoError(t, err) + require.Len(t, va.Items, 0) + } else { + require.NoError(t, err, "node should not be deleted, but got: %v", err) + } }) - r.NoError(err) - r.Len(va.Items, 0) - }) + } +} + +func newActionDeleteNode(nodeName, nodeID, providerID string) *castai.ClusterAction { + return &castai.ClusterAction{ + ActionDeleteNode: &castai.ActionDeleteNode{ + NodeName: nodeName, + NodeID: nodeID, + ProviderId: providerID, + }, + } } diff --git a/internal/actions/drain_node_handler.go b/internal/actions/drain_node_handler.go index ce464879..4ac119b1 100644 --- a/internal/actions/drain_node_handler.go +++ b/internal/actions/drain_node_handler.go @@ -74,6 +74,9 @@ type DrainNodeHandler struct { } func (h *DrainNodeHandler) Handle(ctx context.Context, action *castai.ClusterAction) error { + if action == nil { + return fmt.Errorf("action is nil %w", errAction) + } req, ok := action.Data().(*castai.ActionDrainNode) if !ok { return newUnexpectedTypeErr(action.Data(), req) @@ -83,16 +86,23 @@ func (h *DrainNodeHandler) Handle(ctx context.Context, action *castai.ClusterAct log := h.log.WithFields(logrus.Fields{ "node_name": req.NodeName, "node_id": req.NodeID, + "provider_id": req.ProviderId, "action": reflect.TypeOf(action.Data().(*castai.ActionDrainNode)).String(), ActionIDLogField: action.ID, }) - node, err := h.clientset.CoreV1().Nodes().Get(ctx, req.NodeName, metav1.GetOptions{}) + log.Info("draining kubernetes node") + if req.NodeName == "" || + (req.NodeID == "" && req.ProviderId == "") { + return fmt.Errorf("node name or node ID/provider ID is empty %w", errAction) + } + + node, err := getNodeByIDs(ctx, h.clientset.CoreV1().Nodes(), req.NodeName, req.NodeID, req.ProviderId, log) + if errors.Is(err, errNodeNotFound) || errors.Is(err, errNodeDoesNotMatch) { + log.Info("node not found, skipping draining") + return nil + } if err != nil { - if apierrors.IsNotFound(err) { - log.Info("node not found, skipping draining") - return nil - } return err } diff --git a/internal/actions/drain_node_handler_test.go b/internal/actions/drain_node_handler_test.go index 87a0446b..acd59c9c 100644 --- a/internal/actions/drain_node_handler_test.go +++ b/internal/actions/drain_node_handler_test.go @@ -15,407 +15,13 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/fake" ktest "k8s.io/client-go/testing" "github.com/castai/cluster-controller/internal/castai" ) -func TestDrainNodeHandler(t *testing.T) { - t.Parallel() - r := require.New(t) - - log := logrus.New() - log.SetLevel(logrus.DebugLevel) - - t.Run("drain successfully", func(t *testing.T) { - t.Parallel() - - nodeName := "node1" - podName := "pod1" - clientset := setupFakeClientWithNodePodEviction(nodeName, podName) - prependEvictionReaction(t, clientset, true, false) - - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionDrainNode: &castai.ActionDrainNode{ - NodeName: "node1", - DrainTimeoutSeconds: 1, - Force: true, - }, - CreatedAt: time.Now().UTC(), - } - h := DrainNodeHandler{ - log: log, - clientset: clientset, - cfg: drainNodeConfig{}, - } - - err := h.Handle(context.Background(), action) - r.NoError(err) - - n, err := clientset.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{}) - r.NoError(err) - r.True(n.Spec.Unschedulable) - - _, err = clientset.CoreV1().Pods("default").Get(context.Background(), podName, metav1.GetOptions{}) - r.Error(err) - r.True(apierrors.IsNotFound(err)) - - // Daemon set and static pods and job should not be drained. - _, err = clientset.CoreV1().Pods("default").Get(context.Background(), "ds-pod", metav1.GetOptions{}) - r.NoError(err) - _, err = clientset.CoreV1().Pods("default").Get(context.Background(), "static-pod", metav1.GetOptions{}) - r.NoError(err) - _, err = clientset.CoreV1().Pods("default").Get(context.Background(), "job-pod", metav1.GetOptions{}) - r.NoError(err) - }) - - t.Run("skip drain when node not found", func(t *testing.T) { - t.Parallel() - - nodeName := "node1" - podName := "pod1" - clientset := setupFakeClientWithNodePodEviction(nodeName, podName) - - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionDrainNode: &castai.ActionDrainNode{ - NodeName: "already-deleted-node", - DrainTimeoutSeconds: 1, - Force: true, - }, - CreatedAt: time.Now().UTC(), - } - - h := DrainNodeHandler{ - log: log, - clientset: clientset, - cfg: drainNodeConfig{}, - } - - err := h.Handle(context.Background(), action) - r.NoError(err) - - _, err = clientset.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{}) - r.NoError(err) - }) - - t.Run("when eviction fails for a pod and force=false, leaves node cordoned and skip deletion", func(t *testing.T) { - t.Parallel() - - nodeName := "node1" - podName := "pod1" - clientset := setupFakeClientWithNodePodEviction(nodeName, podName) - prependEvictionReaction(t, clientset, false, false) - - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionDrainNode: &castai.ActionDrainNode{ - NodeName: "node1", - DrainTimeoutSeconds: 1, - Force: false, - }, - CreatedAt: time.Now().UTC(), - } - - h := DrainNodeHandler{ - log: log, - clientset: clientset, - cfg: drainNodeConfig{}, - } - - err := h.Handle(context.Background(), action) - - r.Error(err) - r.ErrorContains(err, "failed to drain via graceful eviction") - - n, err := clientset.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{}) - r.NoError(err) - r.True(n.Spec.Unschedulable) - - _, err = clientset.CoreV1().Pods("default").Get(context.Background(), podName, metav1.GetOptions{}) - r.NoError(err) - }) - - t.Run("when eviction timeout is reached and force=false, leaves node cordoned and skip deletion", func(t *testing.T) { - t.Parallel() - - nodeName := "node1" - podName := "pod1" - clientset := setupFakeClientWithNodePodEviction(nodeName, podName) - prependEvictionReaction(t, clientset, false, true) - - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionDrainNode: &castai.ActionDrainNode{ - NodeName: "node1", - DrainTimeoutSeconds: 0, - Force: false, - }, - CreatedAt: time.Now().UTC(), - } - - h := DrainNodeHandler{ - log: log, - clientset: clientset, - cfg: drainNodeConfig{}, - } - - err := h.Handle(context.Background(), action) - - r.Error(err) - r.ErrorContains(err, "failed to drain via graceful eviction") - r.ErrorIs(err, context.DeadlineExceeded) - - n, err := clientset.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{}) - r.NoError(err) - r.True(n.Spec.Unschedulable) - - _, err = clientset.CoreV1().Pods("default").Get(context.Background(), podName, metav1.GetOptions{}) - r.NoError(err) - }) - - t.Run("eviction fails and force=true, force remove pods", func(t *testing.T) { - t.Parallel() - - cases := []struct { - name string - drainTimeoutSeconds int - retryablePodEvictionErr bool - }{ - { - name: "timeout during eviction", - drainTimeoutSeconds: 0, - retryablePodEvictionErr: true, - }, - { - name: "failed pod during eviction", - drainTimeoutSeconds: 10, - retryablePodEvictionErr: false, - }, - } - - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - t.Parallel() - - r := require.New(t) - nodeName := "node1" - podName := "pod1" - clientset := setupFakeClientWithNodePodEviction(nodeName, podName) - prependEvictionReaction(t, clientset, false, tc.retryablePodEvictionErr) - - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionDrainNode: &castai.ActionDrainNode{ - NodeName: "node1", - DrainTimeoutSeconds: tc.drainTimeoutSeconds, - Force: true, - }, - CreatedAt: time.Now().UTC(), - } - - h := DrainNodeHandler{ - log: log, - clientset: clientset, - cfg: drainNodeConfig{ - podsDeleteTimeout: 700 * time.Millisecond, - podDeleteRetries: 5, - podDeleteRetryDelay: 500 * time.Millisecond, - podEvictRetryDelay: 500 * time.Millisecond, - podsTerminationWaitRetryDelay: 1000 * time.Millisecond, - }, - } - - actualCalls := 0 - clientset.PrependReactor("delete", "pods", func(action ktest.Action) (handled bool, ret runtime.Object, err error) { - deleteAction := action.(ktest.DeleteActionImpl) - if deleteAction.Name == podName { - actualCalls++ - // First call should be graceful; simulate it failed to validate we'll do the forced part. - // This relies on us not retrying 404s (or let's say it tests it :) ). - if deleteAction.DeleteOptions.GracePeriodSeconds == nil { - return true, nil, &apierrors.StatusError{ErrStatus: metav1.Status{Reason: metav1.StatusReasonNotFound}} - } - // Second call should be forced. - r.Equal(int64(0), *deleteAction.DeleteOptions.GracePeriodSeconds) - return false, nil, nil - } - return false, nil, nil - }) - - err := h.Handle(context.Background(), action) - r.NoError(err) - r.Equal(2, actualCalls) - - n, err := clientset.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{}) - r.NoError(err) - r.True(n.Spec.Unschedulable) - - _, err = clientset.CoreV1().Pods("default").Get(context.Background(), podName, metav1.GetOptions{}) - r.True(apierrors.IsNotFound(err)) - }) - } - }) - - t.Run("eviction fails and force=true, at least one pod fails to delete due to internal error, should return error", func(t *testing.T) { - t.Parallel() - - nodeName := "node1" - podName := "pod1" - clientset := setupFakeClientWithNodePodEviction(nodeName, podName) - - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionDrainNode: &castai.ActionDrainNode{ - NodeName: "node1", - DrainTimeoutSeconds: 1, - Force: true, - }, - CreatedAt: time.Now().UTC(), - } - - h := DrainNodeHandler{ - log: log, - clientset: clientset, - cfg: drainNodeConfig{ - podsDeleteTimeout: 7 * time.Second, - podDeleteRetries: 5, - podDeleteRetryDelay: 5 * time.Second, - podEvictRetryDelay: 5 * time.Second, - podsTerminationWaitRetryDelay: 10 * time.Second, - }, - } - - clientset.PrependReactor("delete", "pods", func(action ktest.Action) (handled bool, ret runtime.Object, err error) { - deleteAction := action.(ktest.DeleteActionImpl) - if deleteAction.Name == podName { - return true, nil, &apierrors.StatusError{ErrStatus: metav1.Status{Reason: metav1.StatusReasonInternalError, Message: "internal"}} - } - return false, nil, nil - }) - - err := h.Handle(context.Background(), action) - - var podFailedDeletionErr *podFailedActionError - - r.ErrorAs(err, &podFailedDeletionErr) - r.Len(podFailedDeletionErr.Errors, 1) - r.Contains(podFailedDeletionErr.Errors[0].Error(), "default/pod1") - r.Equal("delete", podFailedDeletionErr.Action) - - _, err = clientset.CoreV1().Pods("default").Get(context.Background(), podName, metav1.GetOptions{}) - r.NoError(err) - }) - - t.Run("eviction fails and force=true, timeout during deletion should be retried and returned", func(t *testing.T) { - t.Parallel() - - nodeName := "node1" - podName := "pod1" - clientset := setupFakeClientWithNodePodEviction(nodeName, podName) - - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionDrainNode: &castai.ActionDrainNode{ - NodeName: "node1", - DrainTimeoutSeconds: 1, - Force: true, - }, - CreatedAt: time.Now().UTC(), - } - - h := DrainNodeHandler{ - log: log, - clientset: clientset, - cfg: drainNodeConfig{ - podsDeleteTimeout: 0, // Force delete to timeout immediately. - podDeleteRetries: 5, - podDeleteRetryDelay: 5 * time.Second, - podEvictRetryDelay: 5 * time.Second, - podsTerminationWaitRetryDelay: 10 * time.Second, - }, - } - - actualDeleteCalls := 0 - clientset.PrependReactor("delete", "pods", func(action ktest.Action) (handled bool, ret runtime.Object, err error) { - deleteAction := action.(ktest.DeleteActionImpl) - if deleteAction.Name == podName { - actualDeleteCalls++ - return true, nil, &apierrors.StatusError{ErrStatus: metav1.Status{Reason: metav1.StatusReasonTooManyRequests, Message: "stop hammering"}} - } - return false, nil, nil - }) - - err := h.Handle(context.Background(), action) - - r.Equal(2, actualDeleteCalls) - r.ErrorIs(err, context.DeadlineExceeded) - - _, err = clientset.CoreV1().Pods("default").Get(context.Background(), podName, metav1.GetOptions{}) - r.NoError(err) - }) - - t.Run("force=true, failed eviction for PDBs should be retried until timeout before deleting", func(t *testing.T) { - t.Parallel() - - // tests specifically that PDB error in eviction is retried and not failed fast. - nodeName := "node1" - podName := "pod1" - clientset := setupFakeClientWithNodePodEviction(nodeName, podName) - - clientset.PrependReactor("create", "pods", func(action ktest.Action) (handled bool, ret runtime.Object, err error) { - if action.GetSubresource() != "eviction" { - return false, nil, nil - } - - // PDB error is a bit specific in k8s to reconstruct... - return true, - nil, - &apierrors.StatusError{ErrStatus: metav1.Status{ - Reason: metav1.StatusReasonTooManyRequests, - Details: &metav1.StatusDetails{ - Causes: []metav1.StatusCause{ - { - Type: policyv1.DisruptionBudgetCause, - }, - }, - }, - }} - }) - - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionDrainNode: &castai.ActionDrainNode{ - NodeName: "node1", - DrainTimeoutSeconds: 2, - Force: false, - }, - CreatedAt: time.Now().UTC(), - } - - h := DrainNodeHandler{ - log: log, - clientset: clientset, - cfg: drainNodeConfig{}, - } - - err := h.Handle(context.Background(), action) - - r.Error(err) - r.ErrorContains(err, "failed to drain via graceful eviction") - r.ErrorIs(err, context.DeadlineExceeded) - - n, err := clientset.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{}) - r.NoError(err) - r.True(n.Spec.Unschedulable) - - _, err = clientset.CoreV1().Pods("default").Get(context.Background(), podName, metav1.GetOptions{}) - r.NoError(err) - }) -} - func TestGetDrainTimeout(t *testing.T) { log := logrus.New() log.SetLevel(logrus.DebugLevel) @@ -526,10 +132,16 @@ func prependEvictionReaction(t testing.TB, c *fake.Clientset, success, retryable } // nolint: unparam -func setupFakeClientWithNodePodEviction(nodeName, podName string) *fake.Clientset { +func setupFakeClientWithNodePodEviction(nodeName, nodeID, providerID, podName string) *fake.Clientset { node := &v1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, }, } pod := &v1.Pod{ @@ -628,3 +240,390 @@ func addEvictionSupport(c *fake.Clientset) { c.Resources = append(c.Resources, coreResources) } + +// nolint: gocognit +func TestDrainNodeHandler_Handle(t *testing.T) { + podFailedDeletionErr := &podFailedActionError{} + t.Parallel() + type fields struct { + clientSet func() *fake.Clientset + } + type args struct { + action *castai.ClusterAction + cfg drainNodeConfig + } + tests := []struct { + name string + fields fields + args args + wantErr error + wantErrorContains string + wantPodIsNotFound bool + wantNodeNotCordoned bool + }{ + { + name: "nil", + args: args{}, + fields: fields{ + clientSet: func() *fake.Clientset { + return fake.NewClientset() + }, + }, + wantErr: errAction, + }, + { + name: "wrong action type", + args: args{ + action: &castai.ClusterAction{ + ActionDeleteNode: &castai.ActionDeleteNode{}, + }, + }, + fields: fields{ + clientSet: func() *fake.Clientset { + return fake.NewClientset() + }, + }, + wantErr: errAction, + }, + { + name: "empty node name", + args: args{ + action: newActionDrainNode("", nodeID, providerID, 1, true), + }, + fields: fields{ + clientSet: func() *fake.Clientset { + return setupFakeClientWithNodePodEviction(nodeName, nodeID, providerID, podName) + }, + }, + wantErr: errAction, + }, + { + name: "empty node ID and provider ID", + args: args{ + action: newPatchNodeAction(nodeName, "", "", + nil, nil, nil, nil, nil), + }, + fields: fields{ + clientSet: func() *fake.Clientset { + return setupFakeClientWithNodePodEviction(nodeName, nodeID, providerID, podName) + }, + }, + wantErr: errAction, + }, + { + name: "action with another node id and provider id - skip drain", + fields: fields{ + clientSet: func() *fake.Clientset { + return setupFakeClientWithNodePodEviction(nodeName, nodeID, providerID, podName) + }, + }, + args: args{ + action: newActionDrainNode(nodeName, "another-node-id", "another-provider-id", 1, true), + }, + wantNodeNotCordoned: true, + }, + { + name: "action with proper node id and another provider id - skip drain", + fields: fields{ + clientSet: func() *fake.Clientset { + return setupFakeClientWithNodePodEviction(nodeName, nodeID, providerID, podName) + }, + }, + args: args{ + action: newActionDrainNode(nodeName, nodeID, "another-provider-id", 1, true), + }, + wantNodeNotCordoned: true, + }, + { + name: "action with another node id and proper provider id - skip drain", + fields: fields{ + clientSet: func() *fake.Clientset { + return setupFakeClientWithNodePodEviction(nodeName, nodeID, providerID, podName) + }, + }, + args: args{ + action: newActionDrainNode(nodeName, nodeID, "another-provider-id", 1, true), + }, + wantNodeNotCordoned: true, + }, + { + name: "drain node successfully", + fields: fields{ + clientSet: func() *fake.Clientset { + c := setupFakeClientWithNodePodEviction(nodeName, nodeID, providerID, podName) + prependEvictionReaction(t, c, true, false) + return c + }, + }, + args: args{ + cfg: drainNodeConfig{}, + action: newActionDrainNode(nodeName, nodeID, providerID, 1, true), + }, + wantPodIsNotFound: true, + }, + { + name: "skip drain when node not found", + fields: fields{ + clientSet: func() *fake.Clientset { + return setupFakeClientWithNodePodEviction(nodeName, nodeID, providerID, podName) + }, + }, + args: args{ + cfg: drainNodeConfig{}, + action: newActionDrainNode("already-deleted-node", nodeID, providerID, 1, true), + }, + }, + { + name: "when eviction fails for a pod and force=false, leaves node cordoned and skip deletion", + fields: fields{ + clientSet: func() *fake.Clientset { + c := setupFakeClientWithNodePodEviction(nodeName, nodeID, providerID, podName) + prependEvictionReaction(t, c, false, false) + return c + }, + }, + args: args{ + cfg: drainNodeConfig{}, + action: newActionDrainNode(nodeName, nodeID, providerID, 1, false), + }, + wantErr: context.DeadlineExceeded, + wantErrorContains: "failed to drain via graceful eviction", + }, + { + name: "when eviction timeout is reached and force=false, leaves node cordoned and skip deletion", + fields: fields{ + clientSet: func() *fake.Clientset { + c := setupFakeClientWithNodePodEviction(nodeName, nodeID, providerID, podName) + prependEvictionReaction(t, c, false, true) + return c + }, + }, + args: args{ + cfg: drainNodeConfig{}, + action: newActionDrainNode(nodeName, nodeID, providerID, 0, false), + }, + wantErr: context.DeadlineExceeded, + wantErrorContains: "failed to drain via graceful eviction", + }, + { + name: "eviction fails and force=true, force remove pods: timeout during eviction", + fields: fields{ + clientSet: func() *fake.Clientset { + c := setupFakeClientWithNodePodEviction(nodeName, nodeID, providerID, podName) + prependEvictionReaction(t, c, false, true) + actualCalls := 0 + c.PrependReactor("delete", "pods", func(action ktest.Action) (handled bool, ret runtime.Object, err error) { + deleteAction := action.(ktest.DeleteActionImpl) + if deleteAction.Name == podName { + actualCalls++ + // First call should be graceful; simulate it failed to validate we'll do the forced part. + // This relies on us not retrying 404s (or let's say it tests it :) ). + if deleteAction.DeleteOptions.GracePeriodSeconds == nil { + return true, nil, &apierrors.StatusError{ErrStatus: metav1.Status{Reason: metav1.StatusReasonNotFound}} + } + // Second call should be forced. + require.Equal(t, int64(0), *deleteAction.DeleteOptions.GracePeriodSeconds) + require.True(t, actualCalls <= 2, "actual calls to delete pod should be at most 2, got %d", actualCalls) + return false, nil, nil + } + return false, nil, nil + }) + return c + }, + }, + args: args{ + cfg: drainNodeConfig{ + podsDeleteTimeout: 700 * time.Millisecond, + podDeleteRetries: 5, + podDeleteRetryDelay: 500 * time.Millisecond, + podEvictRetryDelay: 500 * time.Millisecond, + podsTerminationWaitRetryDelay: 1000 * time.Millisecond, + }, + action: newActionDrainNode(nodeName, nodeID, providerID, 0, true), + }, + wantPodIsNotFound: true, + }, + { + name: "eviction fails and force=true, force remove pods: failed pod during eviction", + fields: fields{ + clientSet: func() *fake.Clientset { + c := setupFakeClientWithNodePodEviction(nodeName, nodeID, providerID, podName) + prependEvictionReaction(t, c, false, false) + actualCalls := 0 + c.PrependReactor("delete", "pods", func(action ktest.Action) (handled bool, ret runtime.Object, err error) { + deleteAction := action.(ktest.DeleteActionImpl) + if deleteAction.Name == podName { + actualCalls++ + // First call should be graceful; simulate it failed to validate we'll do the forced part. + // This relies on us not retrying 404s (or let's say it tests it :) ). + if deleteAction.DeleteOptions.GracePeriodSeconds == nil { + return true, nil, &apierrors.StatusError{ErrStatus: metav1.Status{Reason: metav1.StatusReasonNotFound}} + } + // Second call should be forced. + require.Equal(t, int64(0), *deleteAction.DeleteOptions.GracePeriodSeconds) + require.True(t, actualCalls <= 2, "actual calls to delete pod should be at most 2, got %d", actualCalls) + return false, nil, nil + } + return false, nil, nil + }) + return c + }, + }, + args: args{ + cfg: drainNodeConfig{ + podsDeleteTimeout: 700 * time.Millisecond, + podDeleteRetries: 5, + podDeleteRetryDelay: 500 * time.Millisecond, + podEvictRetryDelay: 500 * time.Millisecond, + podsTerminationWaitRetryDelay: 1000 * time.Millisecond, + }, + action: newActionDrainNode(nodeName, nodeID, providerID, 10, true), + }, + wantPodIsNotFound: true, + }, + { + name: "eviction fails and force=true, at least one pod fails to delete due to internal error, should return error", + fields: fields{ + clientSet: func() *fake.Clientset { + c := setupFakeClientWithNodePodEviction(nodeName, nodeID, providerID, podName) + c.PrependReactor("delete", "pods", func(action ktest.Action) (handled bool, ret runtime.Object, err error) { + deleteAction := action.(ktest.DeleteActionImpl) + if deleteAction.Name == podName { + return true, nil, &apierrors.StatusError{ErrStatus: metav1.Status{Reason: metav1.StatusReasonInternalError, Message: "internal"}} + } + return false, nil, nil + }) + return c + }, + }, + args: args{ + cfg: drainNodeConfig{ + podsDeleteTimeout: 7 * time.Second, + podDeleteRetries: 5, + podDeleteRetryDelay: 5 * time.Second, + podEvictRetryDelay: 5 * time.Second, + podsTerminationWaitRetryDelay: 10 * time.Second, + }, + action: newActionDrainNode(nodeName, nodeID, providerID, 0, true), + }, + wantErr: podFailedDeletionErr, + wantErrorContains: "pod default/pod1 failed deletion: deleting pod pod1 in namespace default: internal", + }, + { + name: "eviction fails and force=true, timeout during deletion should be retried and returned", + fields: fields{ + clientSet: func() *fake.Clientset { + c := setupFakeClientWithNodePodEviction(nodeName, nodeID, providerID, podName) + actualDeleteCalls := 0 + c.PrependReactor("delete", "pods", func(action ktest.Action) (handled bool, ret runtime.Object, err error) { + deleteAction := action.(ktest.DeleteActionImpl) + if deleteAction.Name == podName { + actualDeleteCalls++ + return true, nil, &apierrors.StatusError{ErrStatus: metav1.Status{Reason: metav1.StatusReasonTooManyRequests, Message: "stop hammering"}} + } + return false, nil, nil + }) + return c + }, + }, + args: args{ + cfg: drainNodeConfig{ + podsDeleteTimeout: 0, // Force delete to timeout immediately. + podDeleteRetries: 5, + podDeleteRetryDelay: 5 * time.Second, + podEvictRetryDelay: 5 * time.Second, + podsTerminationWaitRetryDelay: 10 * time.Second, + }, + action: newActionDrainNode(nodeName, nodeID, providerID, 1, true), + }, + wantErr: context.DeadlineExceeded, + }, + { + name: "force=true, failed eviction for PDBs should be retried until timeout before deleting", + fields: fields{ + clientSet: func() *fake.Clientset { + c := setupFakeClientWithNodePodEviction(nodeName, nodeID, providerID, podName) + c.PrependReactor("create", "pods", func(action ktest.Action) (handled bool, ret runtime.Object, err error) { + if action.GetSubresource() != "eviction" { + return false, nil, nil + } + + // PDB error is a bit specific in k8s to reconstruct... + return true, + nil, + &apierrors.StatusError{ErrStatus: metav1.Status{ + Reason: metav1.StatusReasonTooManyRequests, + Details: &metav1.StatusDetails{ + Causes: []metav1.StatusCause{ + { + Type: policyv1.DisruptionBudgetCause, + }, + }, + }, + }} + }) + return c + }, + }, + args: args{ + cfg: drainNodeConfig{}, + action: newActionDrainNode(nodeName, nodeID, providerID, 2, false), + }, + wantErr: context.DeadlineExceeded, + wantErrorContains: "failed to drain via graceful eviction", + }, + } + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + h := &DrainNodeHandler{ + log: logrus.New(), + clientset: tt.fields.clientSet(), + cfg: tt.args.cfg, + } + err := h.Handle(context.Background(), tt.args.action) + require.Equal(t, tt.wantErr != nil, err != nil, "expected error: %v, got: %v", tt.wantErr, err) + if tt.wantErr != nil { + require.ErrorAs(t, err, &tt.wantErr) + require.ErrorContains(t, err, tt.wantErrorContains) + } + + if err != nil { + return + } + + n, err := h.clientset.CoreV1().Nodes().Get(context.Background(), tt.args.action.ActionDrainNode.NodeName, metav1.GetOptions{}) + require.True(t, (err != nil && apierrors.IsNotFound(err)) || + (err == nil && n.Spec.Unschedulable == !tt.wantNodeNotCordoned), + "expected node to be not found or cordoned, got: %v", err) + + _, err = h.clientset.CoreV1().Pods("default").Get(context.Background(), podName, metav1.GetOptions{}) + require.True(t, (tt.wantPodIsNotFound && apierrors.IsNotFound(err)) || (!tt.wantPodIsNotFound && err == nil), "expected pod to be not found, got: %v", err) + + checkPods(t, h.clientset, "ds-pod", "static-pod", "job-pod") + }) + } +} + +func checkPods(t *testing.T, clientset kubernetes.Interface, podNames ...string) { + t.Helper() + for _, podName := range podNames { + _, err := clientset.CoreV1().Pods("default").Get(context.Background(), podName, metav1.GetOptions{}) + require.NoError(t, err, "expected pod %s to be found", podName) + } +} + +func newActionDrainNode(nodeName, nodeID, providerID string, drainTimeoutSeconds int, force bool) *castai.ClusterAction { + return &castai.ClusterAction{ + ID: uuid.New().String(), + ActionDrainNode: &castai.ActionDrainNode{ + NodeName: nodeName, + NodeID: nodeID, + ProviderId: providerID, + DrainTimeoutSeconds: drainTimeoutSeconds, + Force: force, + }, + CreatedAt: time.Now().UTC(), + } +} diff --git a/internal/actions/evict_pod_handler_test.go b/internal/actions/evict_pod_handler_test.go index 3159be52..bc3cdda8 100644 --- a/internal/actions/evict_pod_handler_test.go +++ b/internal/actions/evict_pod_handler_test.go @@ -84,10 +84,12 @@ func TestEvictPodHandler(t *testing.T) { reaction := failingPodEvictionReaction(reactionErr, 2, normalPodEvictionReaction(t, ctx, clientset, 0)) prependPodEvictionReaction(clientset, reaction) - h := NewEvictPodHandler( - log, - clientset, - ) + h := &EvictPodHandler{ + log: log, + clientset: clientset, + podEvictRetryDelay: 5 * time.Millisecond, + podsTerminationWaitRetryDelay: 10 * time.Millisecond, + } action := newEvictPodAction(&castai.ActionEvictPod{ Namespace: pod.Namespace, diff --git a/internal/actions/kubernetes_helpers.go b/internal/actions/kubernetes_helpers.go index c75bb06e..b78163e4 100644 --- a/internal/actions/kubernetes_helpers.go +++ b/internal/actions/kubernetes_helpers.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "fmt" + "strings" "sync" "time" @@ -16,7 +17,9 @@ import ( "k8s.io/apimachinery/pkg/util/strategicpatch" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/kubernetes" + corev1 "k8s.io/client-go/kubernetes/typed/core/v1" + "github.com/castai/cluster-controller/internal/castai" "github.com/castai/cluster-controller/internal/waitext" ) @@ -85,37 +88,74 @@ func patchNodeStatus(ctx context.Context, log logrus.FieldLogger, clientset kube return nil } -func getNodeForPatching(ctx context.Context, log logrus.FieldLogger, clientset kubernetes.Interface, nodeName string) (*v1.Node, error) { - // on GKE we noticed that sometimes the node is not found, even though it is in the cluster - // as a result was returned from watch. But subsequent get request returns not found. - // This is likely due to clientset's caching that's meant to alleviate API's load. - // So we give enough time for cache to sync - ~10s max. +func getNodeByIDs(ctx context.Context, clientSet corev1.NodeInterface, nodeName, nodeID, providerID string, log logrus.FieldLogger) (*v1.Node, error) { + if nodeID == "" && providerID == "" { + return nil, fmt.Errorf("node and provider IDs are empty %w", errAction) + } - var node *v1.Node + n, err := clientSet.Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil && k8serrors.IsNotFound(err) { + return nil, errNodeNotFound + } + if err != nil { + return nil, err + } - boff := waitext.DefaultExponentialBackoff() - boff.Duration = 5 * time.Second + if n == nil { + return nil, errNodeNotFound + } - err := waitext.Retry( - ctx, - boff, - 5, - func(ctx context.Context) (bool, error) { - var err error - node, err = clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) - if err != nil { - return true, err + if err := isNodeIDProviderIDValid(n, nodeID, providerID, log); err != nil { + return nil, fmt.Errorf("requested node ID %s, provider ID %s for node name: %s %w", + nodeID, providerID, n.Name, err) + } + + return n, nil +} + +// isNodeIDProviderIDValid checks if the node's ID and provider ID match the requested ones. +func isNodeIDProviderIDValid(node *v1.Node, nodeID, providerID string, log logrus.FieldLogger) error { + if nodeID == "" && providerID == "" { + // if both node ID and provider ID are empty, we can't validate the node + return fmt.Errorf("node and provider IDs are empty %w", errAction) + } + emptyProviderID := providerID == "" || node.Spec.ProviderID == "" + + // validate provider id only if non-empty in request and in Node spec + // Azure provider: provider id can be empty even if node is Ready + validProviderID := !emptyProviderID && strings.EqualFold(node.Spec.ProviderID, providerID) + + if nodeID == "" && validProviderID { + // if node ID is not set in labels, but provider ID is valid, node is valid + return nil + } + + currentNodeID, ok := node.Labels[castai.LabelNodeID] + if ok && currentNodeID != "" { + if strings.EqualFold(currentNodeID, nodeID) { + if validProviderID { + // if node ID matches and provider ID is valid, node is valid + return nil } - return false, nil - }, - func(err error) { - log.Warnf("getting node, will retry: %v", err) - }, - ) - if err != nil { - return nil, err + if emptyProviderID { + // if node ID matches but provider ID is empty, node is valid + return nil + } + } + } + if (!ok || currentNodeID == "") && validProviderID { + // if node ID is not set in labels, but provider ID is valid, node is valid + return nil } - return node, nil + + if !emptyProviderID && node.Spec.ProviderID != providerID { + // if provider ID is not empty in request and does not match node's provider ID, log err for investigations + log.Errorf("node %v has provider ID %s, but requested provider ID is %s", node.Name, node.Spec.ProviderID, providerID) + } + + // if we reach here, it means that node ID and/or provider ID does not match + return fmt.Errorf("node %v has ID %s and provider ID %s: %w", + node.Name, currentNodeID, node.Spec.ProviderID, errNodeDoesNotMatch) } // executeBatchPodActions executes the action for each pod in the list. diff --git a/internal/actions/kubernetes_helpers_test.go b/internal/actions/kubernetes_helpers_test.go new file mode 100644 index 00000000..35c8ca94 --- /dev/null +++ b/internal/actions/kubernetes_helpers_test.go @@ -0,0 +1,565 @@ +package actions + +import ( + "context" + "fmt" + "testing" + + "github.com/golang/mock/gomock" + "github.com/sirupsen/logrus" + "github.com/stretchr/testify/require" + v1 "k8s.io/api/core/v1" + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + mock_actions "github.com/castai/cluster-controller/internal/actions/mock" + "github.com/castai/cluster-controller/internal/castai" +) + +func Test_isNodeIDProviderIDValid(t *testing.T) { + t.Parallel() + + type args struct { + node *v1.Node + nodeID string + providerID string + } + tests := []struct { + name string + args args + wantErr error + }{ + { + name: "empty node ID and provider id in request", + args: args{ + node: &v1.Node{}, + providerID: "", + nodeID: "", + }, + wantErr: errAction, + }, + { + name: "request node ID is empty but node id exists in node labels and provider ID matches", + args: args{ + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + }, + providerID: providerID, + nodeID: "", + }, + }, + { + name: "request and labels node ID are empty but provider ID matches", + args: args{ + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + castai.LabelNodeID: "", + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + }, + providerID: providerID, + nodeID: "", + }, + }, + { + name: "request node ID is empty and no labels but provider ID matches", + args: args{ + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{}, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + }, + providerID: providerID, + nodeID: "", + }, + }, + { + name: "node ID and provider ID are empty at Node spec", + args: args{ + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{}, + }, + Spec: v1.NodeSpec{ + ProviderID: "", + }, + }, + nodeID: nodeID, + providerID: providerID, + }, + wantErr: errNodeDoesNotMatch, + }, + { + name: "node ID is empty at Node spec and Provider is matching", + args: args{ + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{}, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + }, + nodeID: nodeID, + providerID: providerID, + }, + }, + { + name: "provider ID is empty at Node spec and NodeID is matching", + args: args{ + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: "", + }, + }, + nodeID: nodeID, + providerID: providerID, + }, + }, + { + name: "node ID and provider ID matches", + args: args{ + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + }, + nodeID: nodeID, + providerID: providerID, + }, + }, + { + name: "node ID and provider ID matches - different casing", + args: args{ + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + castai.LabelNodeID: nodeID + "DIFFERENT_CASE", + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID + "DIFFERENT_CASE", + }, + }, + nodeID: nodeID + "different_case", + providerID: providerID + "different_case", + }, + }, + { + name: "node ID does not match label but provider ID matches", + args: args{ + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + castai.LabelNodeID: "node-id-123-not-matching", + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + }, + nodeID: nodeID, + providerID: providerID, + }, + wantErr: errNodeDoesNotMatch, + }, + { + name: "node ID does not match label, provider ID empty", + args: args{ + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + castai.LabelNodeID: "node-id-123-not-matching", + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + }, + nodeID: nodeID, + providerID: "", + }, + wantErr: errNodeDoesNotMatch, + }, + { + name: "node ID and provider ID do not match", + args: args{ + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + castai.LabelNodeID: "node-id-123-not-matching", + }, + }, + Spec: v1.NodeSpec{ + ProviderID: "provider-id-456-not-matching", + }, + }, + nodeID: nodeID, + providerID: providerID, + }, + wantErr: errNodeDoesNotMatch, + }, + { + name: "node ID is match and provider ID does not match", + args: args{ + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: "provider-id-456-not-matching", + }, + }, + nodeID: nodeID, + providerID: providerID, + }, + wantErr: errNodeDoesNotMatch, + }, + { + name: "node ID is match and request provider ID is empty", + args: args{ + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + }, + nodeID: nodeID, + providerID: "", + }, + }, + { + name: "node ID is match and provider ID is empty in Node spec", + args: args{ + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: "", + }, + }, + nodeID: nodeID, + providerID: providerID, + }, + }, + } + for _, tt := range tests { + tt := tt // capture range variable + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + got := isNodeIDProviderIDValid(tt.args.node, tt.args.nodeID, tt.args.providerID, logrus.New()) + require.Equal(t, tt.wantErr != nil, got != nil, "error mismatch", got) + require.ErrorIs(t, got, tt.wantErr) + }) + } +} + +func Test_getNodeByIDs(t *testing.T) { + t.Parallel() + + errInternal := k8serrors.NewInternalError(fmt.Errorf("internal error")) + type args struct { + tuneNodeV1Interface func(m *mock_actions.MockNodeInterface) + nodeName string + nodeID string + providerID string + } + tests := []struct { + name string + args args + wantNode bool + wantErr error + }{ + { + name: "empty node and provider IDs", + wantErr: errAction, + }, + { + name: "node not found", + args: args{ + nodeName: nodeName, + nodeID: nodeID, + tuneNodeV1Interface: func(m *mock_actions.MockNodeInterface) { + m.EXPECT().Get(gomock.Any(), nodeName, metav1.GetOptions{}). + Return(nil, k8serrors.NewNotFound(v1.Resource("nodes"), nodeName)) + }, + }, + wantErr: errNodeNotFound, + }, + { + name: "not matching node ID", + args: args{ + nodeName: nodeName, + nodeID: nodeID, + providerID: providerID, + tuneNodeV1Interface: func(m *mock_actions.MockNodeInterface) { + m.EXPECT().Get(gomock.Any(), nodeName, metav1.GetOptions{}). + Return(&v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: "another-node-id", + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + }, nil) + }, + }, + wantErr: errNodeDoesNotMatch, + }, + { + name: "not matching provider ID", + args: args{ + nodeName: nodeName, + nodeID: nodeID, + providerID: providerID, + tuneNodeV1Interface: func(m *mock_actions.MockNodeInterface) { + m.EXPECT().Get(gomock.Any(), nodeName, metav1.GetOptions{}). + Return(&v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: "another-provider-id", + }, + }, nil) + }, + }, + wantErr: errNodeDoesNotMatch, + }, + { + name: "node id at request is empty but provider ID matches", + args: args{ + nodeName: nodeName, + nodeID: "", + providerID: providerID, + tuneNodeV1Interface: func(m *mock_actions.MockNodeInterface) { + m.EXPECT().Get(gomock.Any(), nodeName, metav1.GetOptions{}). + Return(&v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + }, nil) + }, + }, + wantNode: true, + }, + { + name: "node id at label is empty but provider ID matches", + args: args{ + nodeName: nodeName, + nodeID: nodeID, + providerID: providerID, + tuneNodeV1Interface: func(m *mock_actions.MockNodeInterface) { + m.EXPECT().Get(gomock.Any(), nodeName, metav1.GetOptions{}). + Return(&v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{}, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + }, nil) + }, + }, + wantNode: true, + }, + { + name: "node id is empty at request and Node but provider ID matches", + args: args{ + nodeName: nodeName, + nodeID: "", + providerID: providerID, + tuneNodeV1Interface: func(m *mock_actions.MockNodeInterface) { + m.EXPECT().Get(gomock.Any(), nodeName, metav1.GetOptions{}). + Return(&v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{}, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + }, nil) + }, + }, + wantNode: true, + }, + { + name: "provider id at Node object is empty but node ID matches", + args: args{ + nodeName: nodeName, + nodeID: nodeID, + providerID: providerID, + tuneNodeV1Interface: func(m *mock_actions.MockNodeInterface) { + m.EXPECT().Get(gomock.Any(), nodeName, metav1.GetOptions{}). + Return(&v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: "", + }, + }, nil) + }, + }, + wantNode: true, + }, + { + name: "provider id at request is empty but node ID matches", + args: args{ + nodeName: nodeName, + nodeID: nodeID, + providerID: "", + tuneNodeV1Interface: func(m *mock_actions.MockNodeInterface) { + m.EXPECT().Get(gomock.Any(), nodeName, metav1.GetOptions{}). + Return(&v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + }, nil) + }, + }, + wantNode: true, + }, + { + name: "provider id is empty at request and Node but node ID matches", + args: args{ + nodeName: nodeName, + nodeID: nodeID, + providerID: "", + tuneNodeV1Interface: func(m *mock_actions.MockNodeInterface) { + m.EXPECT().Get(gomock.Any(), nodeName, metav1.GetOptions{}). + Return(&v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: "", + }, + }, nil) + }, + }, + wantNode: true, + }, + { + name: "k8s node getter return error", + args: args{ + nodeName: nodeName, + nodeID: nodeID, + tuneNodeV1Interface: func(m *mock_actions.MockNodeInterface) { + m.EXPECT().Get(gomock.Any(), nodeName, metav1.GetOptions{}). + Return(nil, errInternal) + }, + }, + wantErr: errInternal, + }, + { + name: "node is nill", + args: args{ + nodeName: nodeName, + nodeID: nodeID, + tuneNodeV1Interface: func(m *mock_actions.MockNodeInterface) { + m.EXPECT().Get(gomock.Any(), nodeName, metav1.GetOptions{}). + Return(nil, nil) + }, + }, + wantErr: errNodeNotFound, + }, + { + name: "node found with matching IDs", + args: args{ + nodeName: nodeName, + nodeID: nodeID, + providerID: providerID, + tuneNodeV1Interface: func(m *mock_actions.MockNodeInterface) { + m.EXPECT().Get(gomock.Any(), nodeName, metav1.GetOptions{}). + Return(&v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + }, nil) + }, + }, + wantNode: true, + }, + } + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + ctrl := gomock.NewController(t) + clientSet := mock_actions.NewMockNodeInterface(ctrl) + if tt.args.tuneNodeV1Interface != nil { + tt.args.tuneNodeV1Interface(clientSet) + } + + got, err := getNodeByIDs(context.Background(), clientSet, tt.args.nodeName, tt.args.nodeID, tt.args.providerID, logrus.New()) + require.ErrorIs(t, err, tt.wantErr) + require.Equal(t, tt.wantNode, got != nil, "getNodeByIDs() does not expect node") + }) + } +} diff --git a/internal/actions/mock/corev1.go b/internal/actions/mock/corev1.go new file mode 100644 index 00000000..bd9859b1 --- /dev/null +++ b/internal/actions/mock/corev1.go @@ -0,0 +1,223 @@ +// Code generated by MockGen. DO NOT EDIT. +// Source: k8s.io/client-go/kubernetes/typed/core/v1 (interfaces: NodeInterface) + +// Package mock_actions is a generated GoMock package. +package mock_actions + +import ( + context "context" + reflect "reflect" + + gomock "github.com/golang/mock/gomock" + v1 "k8s.io/api/core/v1" + v10 "k8s.io/apimachinery/pkg/apis/meta/v1" + types "k8s.io/apimachinery/pkg/types" + watch "k8s.io/apimachinery/pkg/watch" + v11 "k8s.io/client-go/applyconfigurations/core/v1" +) + +// MockNodeInterface is a mock of NodeInterface interface. +type MockNodeInterface struct { + ctrl *gomock.Controller + recorder *MockNodeInterfaceMockRecorder +} + +// MockNodeInterfaceMockRecorder is the mock recorder for MockNodeInterface. +type MockNodeInterfaceMockRecorder struct { + mock *MockNodeInterface +} + +// NewMockNodeInterface creates a new mock instance. +func NewMockNodeInterface(ctrl *gomock.Controller) *MockNodeInterface { + mock := &MockNodeInterface{ctrl: ctrl} + mock.recorder = &MockNodeInterfaceMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockNodeInterface) EXPECT() *MockNodeInterfaceMockRecorder { + return m.recorder +} + +// Apply mocks base method. +func (m *MockNodeInterface) Apply(arg0 context.Context, arg1 *v11.NodeApplyConfiguration, arg2 v10.ApplyOptions) (*v1.Node, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Apply", arg0, arg1, arg2) + ret0, _ := ret[0].(*v1.Node) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Apply indicates an expected call of Apply. +func (mr *MockNodeInterfaceMockRecorder) Apply(arg0, arg1, arg2 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Apply", reflect.TypeOf((*MockNodeInterface)(nil).Apply), arg0, arg1, arg2) +} + +// ApplyStatus mocks base method. +func (m *MockNodeInterface) ApplyStatus(arg0 context.Context, arg1 *v11.NodeApplyConfiguration, arg2 v10.ApplyOptions) (*v1.Node, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "ApplyStatus", arg0, arg1, arg2) + ret0, _ := ret[0].(*v1.Node) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// ApplyStatus indicates an expected call of ApplyStatus. +func (mr *MockNodeInterfaceMockRecorder) ApplyStatus(arg0, arg1, arg2 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ApplyStatus", reflect.TypeOf((*MockNodeInterface)(nil).ApplyStatus), arg0, arg1, arg2) +} + +// Create mocks base method. +func (m *MockNodeInterface) Create(arg0 context.Context, arg1 *v1.Node, arg2 v10.CreateOptions) (*v1.Node, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Create", arg0, arg1, arg2) + ret0, _ := ret[0].(*v1.Node) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Create indicates an expected call of Create. +func (mr *MockNodeInterfaceMockRecorder) Create(arg0, arg1, arg2 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Create", reflect.TypeOf((*MockNodeInterface)(nil).Create), arg0, arg1, arg2) +} + +// Delete mocks base method. +func (m *MockNodeInterface) Delete(arg0 context.Context, arg1 string, arg2 v10.DeleteOptions) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Delete", arg0, arg1, arg2) + ret0, _ := ret[0].(error) + return ret0 +} + +// Delete indicates an expected call of Delete. +func (mr *MockNodeInterfaceMockRecorder) Delete(arg0, arg1, arg2 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Delete", reflect.TypeOf((*MockNodeInterface)(nil).Delete), arg0, arg1, arg2) +} + +// DeleteCollection mocks base method. +func (m *MockNodeInterface) DeleteCollection(arg0 context.Context, arg1 v10.DeleteOptions, arg2 v10.ListOptions) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "DeleteCollection", arg0, arg1, arg2) + ret0, _ := ret[0].(error) + return ret0 +} + +// DeleteCollection indicates an expected call of DeleteCollection. +func (mr *MockNodeInterfaceMockRecorder) DeleteCollection(arg0, arg1, arg2 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DeleteCollection", reflect.TypeOf((*MockNodeInterface)(nil).DeleteCollection), arg0, arg1, arg2) +} + +// Get mocks base method. +func (m *MockNodeInterface) Get(arg0 context.Context, arg1 string, arg2 v10.GetOptions) (*v1.Node, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Get", arg0, arg1, arg2) + ret0, _ := ret[0].(*v1.Node) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Get indicates an expected call of Get. +func (mr *MockNodeInterfaceMockRecorder) Get(arg0, arg1, arg2 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Get", reflect.TypeOf((*MockNodeInterface)(nil).Get), arg0, arg1, arg2) +} + +// List mocks base method. +func (m *MockNodeInterface) List(arg0 context.Context, arg1 v10.ListOptions) (*v1.NodeList, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "List", arg0, arg1) + ret0, _ := ret[0].(*v1.NodeList) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// List indicates an expected call of List. +func (mr *MockNodeInterfaceMockRecorder) List(arg0, arg1 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "List", reflect.TypeOf((*MockNodeInterface)(nil).List), arg0, arg1) +} + +// Patch mocks base method. +func (m *MockNodeInterface) Patch(arg0 context.Context, arg1 string, arg2 types.PatchType, arg3 []byte, arg4 v10.PatchOptions, arg5 ...string) (*v1.Node, error) { + m.ctrl.T.Helper() + varargs := []interface{}{arg0, arg1, arg2, arg3, arg4} + for _, a := range arg5 { + varargs = append(varargs, a) + } + ret := m.ctrl.Call(m, "Patch", varargs...) + ret0, _ := ret[0].(*v1.Node) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Patch indicates an expected call of Patch. +func (mr *MockNodeInterfaceMockRecorder) Patch(arg0, arg1, arg2, arg3, arg4 interface{}, arg5 ...interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + varargs := append([]interface{}{arg0, arg1, arg2, arg3, arg4}, arg5...) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Patch", reflect.TypeOf((*MockNodeInterface)(nil).Patch), varargs...) +} + +// PatchStatus mocks base method. +func (m *MockNodeInterface) PatchStatus(arg0 context.Context, arg1 string, arg2 []byte) (*v1.Node, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "PatchStatus", arg0, arg1, arg2) + ret0, _ := ret[0].(*v1.Node) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// PatchStatus indicates an expected call of PatchStatus. +func (mr *MockNodeInterfaceMockRecorder) PatchStatus(arg0, arg1, arg2 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "PatchStatus", reflect.TypeOf((*MockNodeInterface)(nil).PatchStatus), arg0, arg1, arg2) +} + +// Update mocks base method. +func (m *MockNodeInterface) Update(arg0 context.Context, arg1 *v1.Node, arg2 v10.UpdateOptions) (*v1.Node, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Update", arg0, arg1, arg2) + ret0, _ := ret[0].(*v1.Node) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Update indicates an expected call of Update. +func (mr *MockNodeInterfaceMockRecorder) Update(arg0, arg1, arg2 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Update", reflect.TypeOf((*MockNodeInterface)(nil).Update), arg0, arg1, arg2) +} + +// UpdateStatus mocks base method. +func (m *MockNodeInterface) UpdateStatus(arg0 context.Context, arg1 *v1.Node, arg2 v10.UpdateOptions) (*v1.Node, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "UpdateStatus", arg0, arg1, arg2) + ret0, _ := ret[0].(*v1.Node) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// UpdateStatus indicates an expected call of UpdateStatus. +func (mr *MockNodeInterfaceMockRecorder) UpdateStatus(arg0, arg1, arg2 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateStatus", reflect.TypeOf((*MockNodeInterface)(nil).UpdateStatus), arg0, arg1, arg2) +} + +// Watch mocks base method. +func (m *MockNodeInterface) Watch(arg0 context.Context, arg1 v10.ListOptions) (watch.Interface, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Watch", arg0, arg1) + ret0, _ := ret[0].(watch.Interface) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Watch indicates an expected call of Watch. +func (mr *MockNodeInterfaceMockRecorder) Watch(arg0, arg1 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Watch", reflect.TypeOf((*MockNodeInterface)(nil).Watch), arg0, arg1) +} diff --git a/internal/actions/mock/kubernetes.go b/internal/actions/mock/kubernetes.go index 7004373f..bf0e2832 100644 --- a/internal/actions/mock/kubernetes.go +++ b/internal/actions/mock/kubernetes.go @@ -56,6 +56,7 @@ import ( v1beta113 "k8s.io/client-go/kubernetes/typed/rbac/v1beta1" v1alpha3 "k8s.io/client-go/kubernetes/typed/resource/v1alpha3" v1beta114 "k8s.io/client-go/kubernetes/typed/resource/v1beta1" + v1beta21 "k8s.io/client-go/kubernetes/typed/resource/v1beta2" v115 "k8s.io/client-go/kubernetes/typed/scheduling/v1" v1alpha16 "k8s.io/client-go/kubernetes/typed/scheduling/v1alpha1" v1beta115 "k8s.io/client-go/kubernetes/typed/scheduling/v1beta1" @@ -760,6 +761,20 @@ func (mr *MockInterfaceMockRecorder) ResourceV1beta1() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ResourceV1beta1", reflect.TypeOf((*MockInterface)(nil).ResourceV1beta1)) } +// ResourceV1beta2 mocks base method. +func (m *MockInterface) ResourceV1beta2() v1beta21.ResourceV1beta2Interface { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "ResourceV1beta2") + ret0, _ := ret[0].(v1beta21.ResourceV1beta2Interface) + return ret0 +} + +// ResourceV1beta2 indicates an expected call of ResourceV1beta2. +func (mr *MockInterfaceMockRecorder) ResourceV1beta2() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ResourceV1beta2", reflect.TypeOf((*MockInterface)(nil).ResourceV1beta2)) +} + // SchedulingV1 mocks base method. func (m *MockInterface) SchedulingV1() v115.SchedulingV1Interface { m.ctrl.T.Helper() diff --git a/internal/actions/patch_node_handler.go b/internal/actions/patch_node_handler.go index e37c9d62..8cf1ce6d 100644 --- a/internal/actions/patch_node_handler.go +++ b/internal/actions/patch_node_handler.go @@ -3,33 +3,40 @@ package actions import ( "context" "encoding/json" + "errors" "fmt" "reflect" "strconv" + "time" "github.com/sirupsen/logrus" v1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/client-go/kubernetes" "github.com/castai/cluster-controller/internal/castai" + "github.com/castai/cluster-controller/internal/waitext" ) var _ ActionHandler = &PatchNodeHandler{} func NewPatchNodeHandler(log logrus.FieldLogger, clientset kubernetes.Interface) *PatchNodeHandler { return &PatchNodeHandler{ - log: log, - clientset: clientset, + retryTimeout: 5 * time.Second, // default timeout for retrying node patching + log: log, + clientset: clientset, } } type PatchNodeHandler struct { - log logrus.FieldLogger - clientset kubernetes.Interface + retryTimeout time.Duration + log logrus.FieldLogger + clientset kubernetes.Interface } func (h *PatchNodeHandler) Handle(ctx context.Context, action *castai.ClusterAction) error { + if action == nil || action.Data() == nil { + return fmt.Errorf("action or action data is nil %w", errAction) + } req, ok := action.Data().(*castai.ActionPatchNode) if !ok { return newUnexpectedTypeErr(action.Data(), req) @@ -53,13 +60,20 @@ func (h *PatchNodeHandler) Handle(ctx context.Context, action *castai.ClusterAct log := h.log.WithFields(logrus.Fields{ "node_name": req.NodeName, "node_id": req.NodeID, + "provider_id": req.ProviderId, "action": reflect.TypeOf(action.Data().(*castai.ActionPatchNode)).String(), ActionIDLogField: action.ID, }) - node, err := getNodeForPatching(ctx, h.log, h.clientset, req.NodeName) + log.Info("patching kubernetes node") + if req.NodeName == "" || + (req.NodeID == "" && req.ProviderId == "") { + return fmt.Errorf("node name or node ID/provider ID is empty %w", errAction) + } + + node, err := h.getNodeForPatching(ctx, req.NodeName, req.NodeID, req.ProviderId) if err != nil { - if apierrors.IsNotFound(err) { + if errors.Is(err, errNodeNotFound) { log.WithError(err).Infof("node not found, skipping patch") return nil } @@ -107,6 +121,39 @@ func (h *PatchNodeHandler) Handle(ctx context.Context, action *castai.ClusterAct return nil } +func (h *PatchNodeHandler) getNodeForPatching(ctx context.Context, nodeName, nodeID, providerID string) (*v1.Node, error) { + // on GKE we noticed that sometimes the node is not found, even though it is in the cluster + // as a result was returned from watch. But subsequent get request returns not found. + // This is likely due to clientset's caching that's meant to alleviate API's load. + // So we give enough time for cache to sync - ~10s max. + + var node *v1.Node + + boff := waitext.DefaultExponentialBackoff() + boff.Duration = h.retryTimeout + + err := waitext.Retry( + ctx, + boff, + 5, + func(ctx context.Context) (bool, error) { + var err error + node, err = getNodeByIDs(ctx, h.clientset.CoreV1().Nodes(), nodeName, nodeID, providerID, h.log) + if err != nil { + return true, err + } + return false, nil + }, + func(err error) { + h.log.Warnf("getting node, will retry: %v", err) + }, + ) + if err != nil { + return nil, err + } + return node, nil +} + func patchNodeMapField(values, patch map[string]string) map[string]string { if values == nil { values = map[string]string{} diff --git a/internal/actions/patch_node_handler_test.go b/internal/actions/patch_node_handler_test.go index 0c4d2a26..29d30941 100644 --- a/internal/actions/patch_node_handler_test.go +++ b/internal/actions/patch_node_handler_test.go @@ -3,6 +3,7 @@ package actions import ( "context" "testing" + "time" "github.com/google/uuid" "github.com/samber/lo" @@ -11,163 +12,282 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/kubernetes/fake" "github.com/castai/cluster-controller/internal/castai" ) -func TestPatchNodeHandler(t *testing.T) { - r := require.New(t) - - log := logrus.New() - log.SetLevel(logrus.DebugLevel) - - t.Run("patch successfully", func(t *testing.T) { - nodeName := "node1" - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - Labels: map[string]string{ - "l1": "v1", - }, - Annotations: map[string]string{ - "a1": "v1", +func TestPatchNodeHandler_Handle(t *testing.T) { + t.Parallel() + type fields struct { + retryTimeout time.Duration + tuneFakeObjects []runtime.Object + } + type args struct { + action *castai.ClusterAction + } + tests := []struct { + name string + fields fields + args args + wantErr error + wantLabels map[string]string + wantAnnotations map[string]string + wantTaints []v1.Taint + wantCapacity v1.ResourceList + wantUnschedulable bool + }{ + { + name: "nil", + args: args{}, + wantErr: errAction, + }, + { + name: "wrong action type", + args: args{ + action: &castai.ClusterAction{ + ActionDeleteNode: &castai.ActionDeleteNode{}, }, }, - Spec: v1.NodeSpec{ - Taints: []v1.Taint{ - { - Key: "t1", - Value: "v1", - Effect: v1.TaintEffectNoSchedule, + wantErr: errAction, + }, + { + name: "labels contain entry with empty key", + args: args{ + action: newPatchNodeAction(nodeName, nodeID, providerID, + map[string]string{ + "": "v1", }, - { - Key: "t2", - Value: "v2", - Effect: v1.TaintEffectNoSchedule, + nil, nil, nil, nil), + }, + wantErr: errAction, + }, + { + name: "annotations contain entry with empty key", + args: args{ + action: newPatchNodeAction(nodeName, nodeID, providerID, + nil, + map[string]string{ + "": "v1", }, - }, + nil, nil, nil), }, - } - clientset := fake.NewSimpleClientset(node) - - h := PatchNodeHandler{ - log: log, - clientset: clientset, - } - - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionPatchNode: &castai.ActionPatchNode{ - NodeName: "node1", - Labels: map[string]string{ - "-l1": "", - "l2": "v2", + wantErr: errAction, + }, + { + name: "taints contain entry with empty key", + args: args{ + action: newPatchNodeAction(nodeName, nodeID, providerID, + nil, nil, + []castai.NodeTaint{ + { + Key: "", + Value: "v1", + }, + }, + nil, nil), + }, + wantErr: errAction, + }, + { + name: "empty node name", + args: args{ + action: newPatchNodeAction("", nodeID, providerID, + nil, nil, nil, nil, nil), + }, + wantErr: errAction, + }, + { + name: "empty node ID and provider ID", + args: args{ + action: newPatchNodeAction(nodeName, "", "", + nil, nil, nil, nil, nil), + }, + wantErr: errAction, + }, + { + name: "empty node ID and provider ID at Node", // for Azure legacy nodes it is real case, we consider it as error + fields: fields{ + retryTimeout: time.Millisecond, + tuneFakeObjects: []runtime.Object{ + &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + }, + }, }, - Annotations: map[string]string{ - "-a1": "", - "a2": "", + }, + args: args{ + action: newPatchNodeAction(nodeName, nodeID, providerID, + nil, nil, nil, nil, nil), + }, + wantErr: errNodeDoesNotMatch, + }, + { + name: "patch node successfully", + fields: fields{ + retryTimeout: time.Millisecond, + tuneFakeObjects: []runtime.Object{ + &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{ + "l1": "v1", + }, + Annotations: map[string]string{ + "a1": "v1", + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + Taints: []v1.Taint{ + { + Key: "t1", + Value: "v1", + Effect: v1.TaintEffectNoSchedule, + }, + { + Key: "t2", + Value: "v2", + Effect: v1.TaintEffectNoSchedule, + }, + }, + }, + }, }, - Taints: []castai.NodeTaint{ - { - Key: "t3", - Value: "t3", - Effect: string(v1.TaintEffectNoSchedule), + }, + args: args{ + action: newPatchNodeAction(nodeName, nodeID, providerID, + map[string]string{ + "-l1": "", + "l2": "v2", }, - { - Key: "-t2", - Value: "", - Effect: string(v1.TaintEffectNoSchedule), + map[string]string{ + "-a1": "", + "a2": "", + }, + []castai.NodeTaint{ + { + Key: "t3", + Value: "t3", + Effect: string(v1.TaintEffectNoSchedule), + }, + { + Key: "-t2", + Value: "", + Effect: string(v1.TaintEffectNoSchedule), + }, + }, + map[v1.ResourceName]resource.Quantity{ + "foo": resource.MustParse("123"), + }, + nil, + ), + }, + wantLabels: map[string]string{ + "l2": "v2", + }, + wantAnnotations: map[string]string{ + "a2": "", + }, + wantTaints: []v1.Taint{ + {Key: "t1", Value: "v1", Effect: "NoSchedule", TimeAdded: (*metav1.Time)(nil)}, + {Key: "t3", Value: "t3", Effect: "NoSchedule", TimeAdded: (*metav1.Time)(nil)}, + }, + wantCapacity: map[v1.ResourceName]resource.Quantity{ + "foo": resource.MustParse("123"), + }, + }, + { + name: "skip patch when node not found", + fields: fields{ + retryTimeout: time.Millisecond, + tuneFakeObjects: []runtime.Object{ + &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + }, }, - }, - Capacity: map[v1.ResourceName]resource.Quantity{ - "foo": resource.MustParse("123"), }, }, - } - - err := h.Handle(context.Background(), action) - r.NoError(err) - - n, err := clientset.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{}) - r.NoError(err) - - expectedLabels := map[string]string{ - "l2": "v2", - } - r.Equal(expectedLabels, n.Labels) - - expectedAnnotations := map[string]string{ - "a2": "", - } - r.Equal(expectedAnnotations, n.Annotations) - - expectedTaints := []v1.Taint{ - {Key: "t1", Value: "v1", Effect: "NoSchedule", TimeAdded: (*metav1.Time)(nil)}, - {Key: "t3", Value: "t3", Effect: "NoSchedule", TimeAdded: (*metav1.Time)(nil)}, - } - r.Equal(expectedTaints, n.Spec.Taints) - - r.Equal(action.ActionPatchNode.Capacity["foo"], n.Status.Capacity["foo"]) - }) - - t.Run("skip patch when node not found", func(t *testing.T) { - nodeName := "node1" - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - }, - } - clientset := fake.NewSimpleClientset(node) - - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionPatchNode: &castai.ActionPatchNode{ - NodeName: "already-deleted-node", - }, - } - h := PatchNodeHandler{ - log: log, - clientset: clientset, - } - - err := h.Handle(context.Background(), action) - r.NoError(err) - - _, err = clientset.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{}) - r.NoError(err) - }) - - t.Run("cordoning node", func(t *testing.T) { - nodeName := "node1" - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, + args: args{ + action: newPatchNodeAction("notFoundNodeName", nodeID, providerID, + nil, nil, nil, nil, nil), + }, + }, + { + name: "cordoning node", + fields: fields{ + retryTimeout: time.Millisecond, + tuneFakeObjects: []runtime.Object{ + &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Spec: v1.NodeSpec{ + Unschedulable: false, + }, + }, + }, }, - Spec: v1.NodeSpec{ - Unschedulable: false, + args: args{ + action: newPatchNodeAction(nodeName, nodeID, providerID, + nil, nil, nil, nil, lo.ToPtr(true)), }, - } - clientset := fake.NewSimpleClientset(node) - - h := PatchNodeHandler{ - log: log, - clientset: clientset, - } - - action := &castai.ClusterAction{ - ID: uuid.New().String(), - ActionPatchNode: &castai.ActionPatchNode{ - NodeName: "node1", - Unschedulable: lo.ToPtr(true), + wantLabels: map[string]string{ + castai.LabelNodeID: nodeID, }, - } + wantUnschedulable: true, + }, + } + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + clientSet := fake.NewClientset(tt.fields.tuneFakeObjects...) + h := &PatchNodeHandler{ + retryTimeout: tt.fields.retryTimeout, + log: logrus.New(), + clientset: clientSet, + } + err := h.Handle(context.Background(), tt.args.action) + require.Equal(t, tt.wantErr != nil, err != nil, "Handle() error = %v, wantErr %v", err, tt.wantErr) + if tt.wantErr != nil { + require.ErrorIs(t, err, tt.wantErr, "Handle() error mismatch") + } else { + n, err := clientSet.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{}) + require.NoError(t, err) - err := h.Handle(context.Background(), action) - r.NoError(err) + require.Equal(t, tt.wantLabels, n.Labels, "labels mismatch") + require.Equal(t, tt.wantAnnotations, n.Annotations, "annotations mismatch") + require.Equal(t, tt.wantTaints, n.Spec.Taints, "taints mismatch") + require.Equal(t, tt.wantCapacity, n.Status.Capacity, "capacity mismatch") + require.Equal(t, tt.wantUnschedulable, n.Spec.Unschedulable, "unschedulable mismatch") + } + }) + } +} - n, err := clientset.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{}) - r.NoError(err) - r.True(n.Spec.Unschedulable) - }) +func newPatchNodeAction( + nodeName, nodeID, providerID string, + labels, annotations map[string]string, taints []castai.NodeTaint, capacity map[v1.ResourceName]resource.Quantity, + unschedulable *bool, +) *castai.ClusterAction { + return &castai.ClusterAction{ + ID: uuid.New().String(), + ActionPatchNode: &castai.ActionPatchNode{ + NodeName: nodeName, + NodeID: nodeID, + ProviderId: providerID, + Labels: labels, + Annotations: annotations, + Taints: taints, + Capacity: capacity, + Unschedulable: unschedulable, + }, + } } diff --git a/internal/actions/types.go b/internal/actions/types.go index bf731b52..1d60069f 100644 --- a/internal/actions/types.go +++ b/internal/actions/types.go @@ -1,5 +1,6 @@ //go:generate mockgen -destination ./mock/handler.go . ActionHandler //go:generate mockgen -package=mock_actions -destination ./mock/kubernetes.go k8s.io/client-go/kubernetes Interface +//go:generate mockgen -package=mock_actions -destination ./mock/corev1.go k8s.io/client-go/kubernetes/typed/core/v1 NodeInterface package actions @@ -17,7 +18,12 @@ const ( ActionIDLogField = "id" ) -var errAction = errors.New("not valid action") +var ( + errAction = errors.New("not valid action") + errNodeNotFound = errors.New("node not found") + errNodeDoesNotMatch = fmt.Errorf("node does not match") + errNodeWatcherClosed = fmt.Errorf("node watcher closed, no more events will be received") +) func newUnexpectedTypeErr(value, expectedType interface{}) error { return fmt.Errorf("unexpected type %T, expected %T %w", value, expectedType, errAction) diff --git a/internal/castai/types.go b/internal/castai/types.go index d54fe250..9edfc283 100644 --- a/internal/castai/types.go +++ b/internal/castai/types.go @@ -148,13 +148,15 @@ type ActionDelete struct { } type ActionDeleteNode struct { - NodeName string `json:"nodeName"` - NodeID string `json:"nodeId"` + NodeName string `json:"nodeName"` + NodeID string `json:"nodeId"` + ProviderId string `json:"providerId"` } type ActionDrainNode struct { NodeName string `json:"nodeName"` NodeID string `json:"nodeId"` + ProviderId string `json:"providerId"` DrainTimeoutSeconds int `json:"drainTimeoutSeconds"` Force bool `json:"force"` } @@ -168,6 +170,7 @@ type ActionEvictPod struct { type ActionPatchNode struct { NodeName string `json:"nodeName"` NodeID string `json:"nodeId"` + ProviderId string `json:"providerId"` Labels map[string]string `json:"labels"` Taints []NodeTaint `json:"taints"` Annotations map[string]string `json:"annotations"` @@ -196,8 +199,9 @@ type ActionCreateEvent struct { type ActionDisconnectCluster struct{} type ActionCheckNodeDeleted struct { - NodeName string `json:"nodeName"` - NodeID string `json:"nodeId"` + NodeName string `json:"nodeName"` + NodeID string `json:"nodeId"` + ProviderId string `json:"providerId"` } type ActionCheckNodeStatus_Status string @@ -210,6 +214,7 @@ const ( type ActionCheckNodeStatus struct { NodeName string `json:"nodeName"` NodeID string `json:"nodeId"` + ProviderId string `json:"providerId"` NodeStatus ActionCheckNodeStatus_Status `json:"nodeStatus,omitempty"` WaitTimeoutSeconds *int32 `json:"waitTimeoutSeconds,omitempty"` } diff --git a/loadtest/scenarios/check_node_deleted_stuck.go b/loadtest/scenarios/check_node_deleted_stuck.go index 3b752d64..67018b16 100644 --- a/loadtest/scenarios/check_node_deleted_stuck.go +++ b/loadtest/scenarios/check_node_deleted_stuck.go @@ -114,7 +114,8 @@ func (s *checkNodeDeletedStuckScenario) Run(ctx context.Context, _ string, _ kub ID: uuid.NewString(), CreatedAt: time.Now().UTC(), ActionCheckNodeDeleted: &castai.ActionCheckNodeDeleted{ - NodeName: node.Name, + NodeName: node.Name, + ProviderId: node.Spec.ProviderID, }, }) }