Skip to content

Commit bf96d59

Browse files
committed
update go mod
1 parent 2cf323b commit bf96d59

16 files changed

+340
-228
lines changed

internal/actions/check_node_deleted.go

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,13 @@ package actions
33
import (
44
"context"
55
"errors"
6+
"fmt"
67
"reflect"
78
"time"
89

910
"github.com/sirupsen/logrus"
11+
apierrors "k8s.io/apimachinery/pkg/api/errors"
12+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1013
"k8s.io/client-go/kubernetes"
1114

1215
"github.com/castai/cluster-controller/internal/castai"
@@ -49,7 +52,6 @@ func (h *CheckNodeDeletedHandler) Handle(ctx context.Context, action *castai.Clu
4952
"node_name": req.NodeName,
5053
"node_id": req.NodeID,
5154
"type": reflect.TypeOf(action.Data().(*castai.ActionCheckNodeDeleted)).String(),
52-
"provider_id": req.ProviderId,
5355
ActionIDLogField: action.ID,
5456
})
5557
log.Info("checking if node is deleted")
@@ -61,15 +63,33 @@ func (h *CheckNodeDeletedHandler) Handle(ctx context.Context, action *castai.Clu
6163
boff,
6264
h.cfg.retries,
6365
func(ctx context.Context) (bool, error) {
64-
n, err := getNodeByIDs(ctx, h.clientset, req.NodeName, req.NodeID, req.ProviderId)
65-
if n != nil {
66-
return false, errNodeNotDeleted
66+
n, err := h.clientset.CoreV1().Nodes().Get(ctx, req.NodeName, metav1.GetOptions{})
67+
if apierrors.IsNotFound(err) {
68+
return false, nil
6769
}
6870

69-
if errors.Is(err, errNodeNotFound) {
71+
if n == nil {
7072
return false, nil
7173
}
7274

75+
currentNodeID, ok := n.Labels[castai.LabelNodeID]
76+
if !ok {
77+
log.Info("node doesn't have castai node id label")
78+
}
79+
if currentNodeID != "" {
80+
if currentNodeID != req.NodeID {
81+
log.Info("node name was reused. Original node is deleted")
82+
return false, nil
83+
}
84+
if currentNodeID == req.NodeID {
85+
return false, fmt.Errorf("current node id = request node ID %w", errNodeNotDeleted)
86+
}
87+
}
88+
89+
if n != nil {
90+
return false, errNodeNotDeleted
91+
}
92+
7393
return true, err
7494
},
7595
func(err error) {
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
package actions
2+
3+
import (
4+
"context"
5+
"testing"
6+
7+
"github.com/google/uuid"
8+
"github.com/sirupsen/logrus"
9+
"github.com/stretchr/testify/require"
10+
v1 "k8s.io/api/core/v1"
11+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
12+
"k8s.io/client-go/kubernetes/fake"
13+
14+
"github.com/castai/cluster-controller/internal/castai"
15+
)
16+
17+
//nolint:goconst
18+
func TestCheckNodeDeletedHandler(t *testing.T) {
19+
r := require.New(t)
20+
21+
log := logrus.New()
22+
log.SetLevel(logrus.DebugLevel)
23+
24+
t.Run("return error when node is not deleted", func(t *testing.T) {
25+
nodeName := "node1"
26+
node := &v1.Node{
27+
ObjectMeta: metav1.ObjectMeta{
28+
Name: nodeName,
29+
},
30+
}
31+
clientset := fake.NewSimpleClientset(node)
32+
33+
h := CheckNodeDeletedHandler{
34+
log: log,
35+
clientset: clientset,
36+
cfg: checkNodeDeletedConfig{},
37+
}
38+
39+
action := &castai.ClusterAction{
40+
ID: uuid.New().String(),
41+
ActionCheckNodeDeleted: &castai.ActionCheckNodeDeleted{NodeName: "node1"},
42+
}
43+
44+
err := h.Handle(context.Background(), action)
45+
r.EqualError(err, "node is not deleted")
46+
})
47+
48+
t.Run("handle check successfully when node is not found", func(t *testing.T) {
49+
clientset := fake.NewSimpleClientset()
50+
51+
h := CheckNodeDeletedHandler{
52+
log: log,
53+
clientset: clientset,
54+
cfg: checkNodeDeletedConfig{},
55+
}
56+
57+
action := &castai.ClusterAction{
58+
ID: uuid.New().String(),
59+
ActionCheckNodeDeleted: &castai.ActionCheckNodeDeleted{NodeName: "node1"},
60+
}
61+
62+
err := h.Handle(context.Background(), action)
63+
r.NoError(err)
64+
})
65+
}

internal/actions/check_node_status.go

Lines changed: 34 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,13 @@ package actions
22

33
import (
44
"context"
5-
"errors"
65
"fmt"
76
"reflect"
87
"time"
98

109
"github.com/sirupsen/logrus"
1110
corev1 "k8s.io/api/core/v1"
11+
apierrors "k8s.io/apimachinery/pkg/api/errors"
1212
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1313
"k8s.io/client-go/kubernetes"
1414

@@ -39,7 +39,6 @@ func (h *CheckNodeStatusHandler) Handle(ctx context.Context, action *castai.Clus
3939
log := h.log.WithFields(logrus.Fields{
4040
"node_name": req.NodeName,
4141
"node_id": req.NodeID,
42-
"provider_id": req.ProviderId,
4342
"node_status": req.NodeStatus,
4443
"type": reflect.TypeOf(action.Data().(*castai.ActionCheckNodeStatus)).String(),
4544
ActionIDLogField: action.ID,
@@ -72,28 +71,42 @@ func (h *CheckNodeStatusHandler) checkNodeDeleted(ctx context.Context, log *logr
7271
b,
7372
waitext.Forever,
7473
func(ctx context.Context) (bool, error) {
75-
n, err := getNodeByIDs(ctx, h.clientset, req.NodeName, req.NodeID, req.ProviderId)
76-
if n != nil {
77-
return false, errNodeNotDeleted
74+
n, err := h.clientset.CoreV1().Nodes().Get(ctx, req.NodeName, metav1.GetOptions{})
75+
if apierrors.IsNotFound(err) {
76+
return false, nil
7877
}
7978

80-
if errors.Is(err, errNodeNotValid) {
81-
log.WithFields(map[string]interface{}{
82-
"node": req.NodeName,
83-
"node_id": req.NodeID,
84-
"provider_id": req.ProviderId,
85-
}).Warnf("node is not valid")
86-
return false, errNodeNotValid
87-
}
79+
// If node is nil - deleted
80+
// If label is present and doesn't match - node was reused - deleted
81+
// If label is present and matches - node is not deleted
82+
// If label is not present and node is not nil - node is not deleted (potentially corrupted state).
8883

89-
if errors.Is(err, errNodeNotFound) {
84+
if n == nil {
9085
return false, nil
9186
}
9287

88+
currentNodeID, ok := n.Labels[castai.LabelNodeID]
89+
if !ok {
90+
log.Info("node doesn't have castai node id label")
91+
}
92+
if currentNodeID != "" {
93+
if currentNodeID != req.NodeID {
94+
log.Info("node name was reused. Original node is deleted")
95+
return false, nil
96+
}
97+
if currentNodeID == req.NodeID {
98+
return false, fmt.Errorf("current node id is equal to requested node id: %v %w", req.NodeID, errNodeNotDeleted)
99+
}
100+
}
101+
102+
if n != nil {
103+
return false, errNodeNotDeleted
104+
}
105+
93106
return true, err
94107
},
95108
func(err error) {
96-
log.Warnf("check node %s status failed, will retry: %v", req.NodeName, err)
109+
h.log.Warnf("check node %s status failed, will retry: %v", req.NodeName, err)
97110
},
98111
)
99112
}
@@ -115,7 +128,7 @@ func (h *CheckNodeStatusHandler) checkNodeReady(ctx context.Context, _ *logrus.E
115128
defer watch.Stop()
116129
for r := range watch.ResultChan() {
117130
if node, ok := r.Object.(*corev1.Node); ok {
118-
if isNodeReady(node, req.NodeID, req.ProviderId) {
131+
if isNodeReady(node, req.NodeID) {
119132
return nil
120133
}
121134
}
@@ -124,11 +137,13 @@ func (h *CheckNodeStatusHandler) checkNodeReady(ctx context.Context, _ *logrus.E
124137
return fmt.Errorf("timeout waiting for node %s to become ready", req.NodeName)
125138
}
126139

127-
func isNodeReady(node *corev1.Node, castNodeID, providerID string) bool {
140+
func isNodeReady(node *corev1.Node, castNodeID string) bool {
128141
// if node has castai node id label, check if it matches the one we are waiting for
129142
// if it doesn't match, we can skip this node.
130-
if err := isNodeIDProviderIDValid(node, castNodeID, providerID); err != nil {
131-
return false
143+
if val, ok := node.Labels[castai.LabelNodeID]; ok {
144+
if val != "" && val != castNodeID {
145+
return false
146+
}
132147
}
133148
for _, cond := range node.Status.Conditions {
134149
if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue && !containsUninitializedNodeTaint(node.Spec.Taints) {

internal/actions/check_node_status_test.go

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ import (
1919
"github.com/castai/cluster-controller/internal/castai"
2020
)
2121

22-
// nolint:goconst
2322
func TestCheckStatus_Deleted(t *testing.T) {
2423
log := logrus.New()
2524
log.SetLevel(logrus.DebugLevel)
@@ -80,7 +79,7 @@ func TestCheckStatus_Deleted(t *testing.T) {
8079
}
8180

8281
err := h.Handle(context.Background(), action)
83-
r.EqualError(err, errNodeNotValid.Error())
82+
r.EqualError(err, "node is not deleted")
8483
})
8584

8685
t.Run("handle check successfully when node is not found", func(t *testing.T) {
@@ -132,7 +131,7 @@ func TestCheckStatus_Deleted(t *testing.T) {
132131
}
133132

134133
err := h.Handle(context.Background(), action)
135-
r.EqualError(err, errNodeNotValid.Error())
134+
r.NoError(err)
136135
})
137136
}
138137

@@ -177,12 +176,6 @@ func TestCheckStatus_Ready(t *testing.T) {
177176
node := &v1.Node{
178177
ObjectMeta: metav1.ObjectMeta{
179178
Name: nodeName,
180-
Labels: map[string]string{
181-
castai.LabelNodeID: "node1-id",
182-
},
183-
},
184-
Spec: v1.NodeSpec{
185-
ProviderID: "aws:///us-east-1",
186179
},
187180
Status: v1.NodeStatus{
188181
Conditions: []v1.NodeCondition{
@@ -205,8 +198,6 @@ func TestCheckStatus_Ready(t *testing.T) {
205198
ID: uuid.New().String(),
206199
ActionCheckNodeStatus: &castai.ActionCheckNodeStatus{
207200
NodeName: "node1",
208-
NodeID: "node1-id",
209-
ProviderId: "aws:///us-east-1",
210201
NodeStatus: castai.ActionCheckNodeStatus_READY,
211202
WaitTimeoutSeconds: &timeout,
212203
},
@@ -247,8 +238,7 @@ func TestCheckStatus_Ready(t *testing.T) {
247238
},
248239
},
249240
Spec: v1.NodeSpec{
250-
Taints: []v1.Taint{taintCloudProviderUninitialized},
251-
ProviderID: "aws:///us-east-1",
241+
Taints: []v1.Taint{taintCloudProviderUninitialized},
252242
},
253243
}
254244
clientset := fake.NewClientset(node)
@@ -263,7 +253,6 @@ func TestCheckStatus_Ready(t *testing.T) {
263253
ID: uuid.New().String(),
264254
ActionCheckNodeStatus: &castai.ActionCheckNodeStatus{
265255
NodeName: "node1",
266-
ProviderId: "aws:///us-east-1",
267256
NodeStatus: castai.ActionCheckNodeStatus_READY,
268257
WaitTimeoutSeconds: &timeout,
269258
},

internal/actions/csr/svc.go

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,15 @@ import (
2424
)
2525

2626
const (
27-
approveCSRTimeout = 4 * time.Minute
28-
groupSystemNodesName = "system:nodes"
27+
approveCSRTimeout = 4 * time.Minute
28+
groupSystemNodesName = "system:nodes"
29+
// kubeletBootstrapRequestingUser is the observed requestor for node's CSRs from real GKE clusters.
2930
kubeletBootstrapRequestingUser = "kubelet-bootstrap"
30-
clusterControllerSAName = "system:serviceaccount:castai-agent:castai-cluster-controller"
31-
approvedMessage = "This CSR was approved by CAST AI"
32-
csrOutdatedAfter = time.Hour
31+
// kubeletNodepoolBootstrapRequestingUser is the observed requestor for node's CSRs for GKE clusters 1.33+.
32+
kubeletNodepoolBootstrapRequestingUser = "kubelet-nodepool-bootstrap"
33+
clusterControllerSAName = "system:serviceaccount:castai-agent:castai-cluster-controller"
34+
approvedMessage = "This CSR was approved by CAST AI"
35+
csrOutdatedAfter = time.Hour
3336
)
3437

3538
func NewApprovalManager(log logrus.FieldLogger, clientset kubernetes.Interface) *ApprovalManager {
@@ -148,8 +151,8 @@ func (m *ApprovalManager) runAutoApproveForCastAINodes(ctx context.Context, c <-
148151
"signer": csr.SignerName(),
149152
"csr": csr.Name(),
150153
})
151-
if shouldSkip(log, csr) {
152-
log.Debug("skipping csr")
154+
if skip, reason := shouldSkip(csr); skip {
155+
log.Infof("skipping csr due to reason: %s", reason)
153156
return
154157
}
155158
log.Info("auto approving csr")
@@ -221,32 +224,26 @@ func (m *ApprovalManager) handle(ctx context.Context, log logrus.FieldLogger, cs
221224
return errors.New("certificate signing request was not approved")
222225
}
223226

224-
func shouldSkip(log logrus.FieldLogger, csr *wrapper.CSR) bool {
227+
func shouldSkip(csr *wrapper.CSR) (bool, string) {
225228
if csr.Approved() {
226-
log.Debug("csr already approved")
227-
return true
229+
return true, "csr already approved"
228230
}
229231
if time.Since(csr.CreatedAt()) > csrOutdatedAfter {
230-
log.Debug("csr is outdated")
231-
return true
232+
return true, fmt.Sprintf("csr is outdated, %v is larger than %v", time.Since(csr.CreatedAt()), csrOutdatedAfter)
232233
}
233234
if !managedSigner(csr.SignerName()) {
234-
log.Debug("csr unknown signer")
235-
return true
235+
return true, fmt.Sprintf("csr unknown signer: %s", csr.SignerName())
236236
}
237237
if !managedCSRNamePrefix(csr.Name()) {
238-
log.Debug("csr name not managed by CAST AI: ", csr.Name())
239-
return true
238+
return true, fmt.Sprintf("csr name not managed by CAST AI: %s", csr.Name())
240239
}
241240
if !managedCSRRequestingUser(csr.RequestingUser()) {
242-
log.Debug("csr requesting user is not managed by CAST AI")
243-
return true
241+
return true, fmt.Sprintf("csr requesting user is not managed by CAST AI: %s", csr.RequestingUser())
244242
}
245243
if !managerSubjectCommonName(csr.ParsedCertificateRequest().Subject.CommonName) {
246-
log.Debug("csr common name is not managed by CAST AI")
247-
return true
244+
return true, fmt.Sprintf("csr common name is not managed by CAST AI %s", csr.ParsedCertificateRequest().Subject.CommonName)
248245
}
249-
return false
246+
return false, ""
250247
}
251248

252249
func managedSigner(signerName string) bool {
@@ -259,7 +256,10 @@ func managedCSRNamePrefix(n string) bool {
259256
}
260257

261258
func managedCSRRequestingUser(s string) bool {
262-
return s == kubeletBootstrapRequestingUser || s == clusterControllerSAName || strings.HasPrefix(s, "system:node:")
259+
return s == kubeletBootstrapRequestingUser || // kubelet bootstrap user variation
260+
s == kubeletNodepoolBootstrapRequestingUser || // kubelet bootstrap user variation (observed initially on GKE 1.33+)
261+
s == clusterControllerSAName || // if created by CC
262+
strings.HasPrefix(s, "system:node:") // Post-bootstrap node user; usually for serving certificate
263263
}
264264

265265
func managerSubjectCommonName(commonName string) bool {

0 commit comments

Comments
 (0)