Skip to content

Commit 7c14b12

Browse files
andrewd-zededa authored and rene committed
eve-k: fix node delete if replicas list encounters crd get error
Fix LonghornReplicaList to be more resilient to failure modes. If the longhorn namespace exists, suggesting an intact Longhorn install, but the replica CRD is missing, return no replicas and no error. This allows the drain to be skipped so that node delete or cluster delete can proceed in a timely manner. Fix node delete to be more resilient: defer the node delete so that it runs in all error cases. Signed-off-by: Andrew Durbin <andrewd@zededa.com>
1 parent 82777e4 commit 7c14b12

File tree

2 files changed

+23
-7
lines changed

2 files changed

+23
-7
lines changed

pkg/pillar/cmd/zedkube/drain.go

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -355,8 +355,23 @@ func drainAndDeleteNode(ctx *zedkube) {
355355
log.Errorf("drainAndDeleteNode: can't get clientset %v", err)
356356
return
357357
}
358-
359358
nodeName := ctx.nodeName
359+
360+
nodeDeleteFn := func() {
361+
if nodeName == "" {
362+
log.Errorf("drainAndDeleteNode no nodename available for delete")
363+
return
364+
}
365+
if err := clientset.CoreV1().Nodes().Delete(context.Background(), nodeName, metav1.DeleteOptions{}); err != nil {
366+
log.Errorf("drainAndDeleteNode: clientset.CoreV1().Nodes().Delete failed: %v", err)
367+
return
368+
}
369+
log.Noticef("drainAndDeleteNode: node %s drained and deleted", nodeName)
370+
}
371+
// For cases where the node-in-deletion is not healthy, don't
372+
// leave a stale node object in the cluster, ensure its removed.
373+
defer nodeDeleteFn()
374+
360375
node, err := clientset.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{})
361376
if err != nil {
362377
log.Errorf("drainAndDeleteNode: can't get nodes %v, for %s", err, nodeName)
@@ -414,11 +429,5 @@ func drainAndDeleteNode(ctx *zedkube) {
414429
return
415430
}
416431
}
417-
418-
if err := clientset.CoreV1().Nodes().Delete(context.Background(), nodeName, metav1.DeleteOptions{}); err != nil {
419-
log.Errorf("drainAndDeleteNode: clientset.CoreV1().Nodes().Delete failed: %v", err)
420-
return
421-
}
422-
log.Noticef("drainAndDeleteNode: node %s drained and deleted", nodeName)
423432
return
424433
}

pkg/pillar/kubeapi/longhorninfo.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"github.com/longhorn/longhorn-manager/k8s/pkg/client/clientset/versioned"
2020
appsv1 "k8s.io/api/apps/v1"
2121
corev1 "k8s.io/api/core/v1"
22+
k8serrors "k8s.io/apimachinery/pkg/api/errors"
2223
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2324
"k8s.io/client-go/kubernetes"
2425
)
@@ -396,6 +397,12 @@ func LonghornReplicaList(ownerNodeName string, longhornVolName string) (*lhv1bet
396397
LabelSelector: strings.Join(labelSelectors, ","),
397398
})
398399
if err != nil {
400+
// No replicas or no replicas crd is not a reason to error:
401+
// eg. the server could not find the requested resource (get replicas.longhorn.io)
402+
// return empty list instead of error
403+
if k8serrors.IsNotFound(err) {
404+
return &lhv1beta2.ReplicaList{}, nil
405+
}
399406
return nil, fmt.Errorf("LonghornReplicaList labelSelector:%s can't get replicas: %v", strings.Join(labelSelectors, ","), err)
400407
}
401408
return replicas, nil

0 commit comments

Comments
 (0)