Open
Description
What steps did you take and what happened:
cluster-api-provider-elf (CAPE) is an infrastructure provider for cluster-api (CAPI).
While a CAPI Machine that did not yet have a providerID was being deleted, the ElfMachine successfully set its providerID at the same time (the Kubernetes worker node was already up). CAPI removed the CAPI Machine and the ElfMachine directly, but did not delete the associated Kubernetes node, because the CAPI Machine had not yet synced the ElfMachine's providerID value.
// cluster-api/internal/controllers/machine/machine_controller.go
// reconcileDelete (abridged excerpt; "..." marks code omitted from the quote)
// first asks isDeleteNodeAllowed whether the Machine's Kubernetes Node may be
// deleted, and later deletes the Node only when that check returned no error.
func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine) (ctrl.Result, error) {
err := r.isDeleteNodeAllowed(ctx, cluster, m)
// Node deletion is permitted only when the check returned no error.
isDeleteNodeAllowed := err == nil
if err != nil {
switch err {
// These sentinel errors are only logged: reconciliation continues, but
// isDeleteNodeAllowed stays false, so the Node deletion below is skipped.
case errNoControlPlaneNodes, errLastControlPlaneNode, errNilNodeRef, errClusterIsBeingDeleted, errControlPlaneIsBeingDeleted:
var nodeName = ""
if m.Status.NodeRef != nil {
nodeName = m.Status.NodeRef.Name
}
log.Info("Deleting Kubernetes Node associated with Machine is not allowed", "Node", klog.KRef("", nodeName), "cause", err.Error())
default:
// Any other error aborts this reconcile and is returned to the caller.
return ctrl.Result{}, errors.Wrapf(err, "failed to check if Kubernetes Node deletion is allowed")
}
}
...
// In the reported scenario this block is never executed: the check above
// returned errNilNodeRef, so isDeleteNodeAllowed is false and the Node is
// left behind.
if isDeleteNodeAllowed {
log.Info("Deleting node", "Node", klog.KRef("", m.Status.NodeRef.Name))
var deleteNodeErr error
// Poll with a 2-second interval, bounded by r.nodeDeletionRetryTimeout.
waitErr := wait.PollImmediate(2*time.Second, r.nodeDeletionRetryTimeout, func() (bool, error) {
// Keep retrying while deleteNode fails with anything other than NotFound.
if deleteNodeErr = r.deleteNode(ctx, cluster, m.Status.NodeRef.Name); deleteNodeErr != nil && !apierrors.IsNotFound(errors.Cause(deleteNodeErr)) {
return false, nil
}
// Deletion succeeded, or the Node is already gone: stop polling.
return true, nil
})
}
}
What did you expect to happen:
CAPI should delete the Kubernetes node when the ElfMachine has a providerID but the CAPI Machine does not yet have one.
logs
# CAPE logs
I1014 03:21:11.555773 1 elfmachine_controller.go:639] cape-controller-manager/elfmachine-controller "msg"="Set node providerID success" "elfCluster"="mycluster" "elfMachine"="mycluster-worker1-49wkt" "namespace"="default" "cluster"="mycluster" "node"="mycluster-worker1-49wkt" "providerID"="elf://165d2fb5-2b7a-477c-a752-c777581738c5"
I1014 03:21:11.591151 1 elfmachine_controller.go:306] cape-controller-manager/elfmachine-controller "msg"="Reconciling ElfMachine delete" "elfCluster"="mycluster" "elfMachine"="mycluster-worker1-49wkt" "namespace"="default"
E1014 03:21:39.672186 1 elfmachine_controller.go:209] cape-controller-manager/elfmachine-controller "msg"="patch failed" "error"="elfmachines.infrastructure.cluster.x-k8s.io \"mycluster-worker1-49wkt\" not found" "elfCluster"="mycluster" "namespace"="default" "elfMachine"="infrastructure.cluster.x-k8s.io/v1beta1, Kind=ElfMachine default/mycluster-worker1-49wkt"
E1014 03:21:39.672277 1 controller.go:326] "msg"="Reconciler error" "error"="elfmachines.infrastructure.cluster.x-k8s.io \"mycluster-worker1-49wkt\" not found" "controller"="elfmachine" "controllerGroup"="infrastructure.cluster.x-k8s.io" "controllerKind"="ElfMachine" "elfMachine"={"name":"mycluster-worker1-49wkt","namespace":"default"} "name"="mycluster-worker1-49wkt" "namespace"="default" "reconcileID"="8a21bd13-f4ce-4fab-a893-649d6d672020"
# CAPI logs
I1014 03:20:48.780918 1 machine_controller_noderef.go:49] "Cannot reconcile Machine's Node, no valid ProviderID yet" controller="machine" controllerGroup="cluster.x-k8s.io" controllerKind="Machine" name="mycluster-worker1-5cbdd99959-d4jnw" reconcileID=b97e47a2-58ae-41cb-8f5c-a1bf135af532 machine="mycluster-worker1-5cbdd99959-d4jnw" namespace="default" cluster="mycluster"
I1014 03:21:11.478341 1 machineset_controller.go:460] "Deleted machine" controller="machineset" controllerGroup="cluster.x-k8s.io" controllerKind="MachineSet" machineSet="default/mycluster-worker1-5cbdd99959" namespace="default" name="mycluster-worker1-5cbdd99959" reconcileID=8d787ed2-6db0-4c48-be9d-ed35b7243258 machine="mycluster-worker1-5cbdd99959-d4jnw"
I1014 03:21:11.479102 1 machine_controller.go:296] "Deleting Kubernetes Node associated with Machine is not allowed" controller="machine" controllerGroup="cluster.x-k8s.io" controllerKind="Machine" machine="default/mycluster-worker1-5cbdd99959-d4jnw" namespace="default" name="mycluster-worker1-5cbdd99959-d4jnw" reconcileID=052c63c1-8fe9-4444-b204-fd608cc5a2a7 cluster="mycluster" node="nil" cause="noderef is nil"
E1014 03:21:39.755615 1 controller.go:326] "Reconciler error" err="machines.cluster.x-k8s.io \"mycluster-worker1-5cbdd99959-d4jnw\" not found" controller="machine" controllerGroup="cluster.x-k8s.io" controllerKind="Machine" machine="default/mycluster-worker1-5cbdd99959-d4jnw" namespace="default" name="mycluster-worker1-5cbdd99959-d4jnw" reconcileID=6ceb8fa1-94ca-4cba-923c-35164d71e8d6
The node mycluster-worker1-49wkt should have been deleted, but it still exists (NotReady):
[root@mycluster-control-plane-p2v2g ~]# kubectl get nodes
NAME STATUS ROLES AGE VERSION
mycluster-control-plane-24twx Ready control-plane 3d19h v1.24.0
mycluster-control-plane-8hzgz Ready control-plane 3d19h v1.24.0
mycluster-control-plane-fjqnz Ready control-plane 3d18h v1.24.0
mycluster-worker1-49wkt NotReady <none> 3d3h v1.24.0
mycluster-worker1-hn5vq Ready <none> 3d3h v1.24.0
Environment:
- Cluster-api version: v1.2.2
- minikube/kind version: v0.14.0
- Kubernetes version: v1.24.0
- OS: CentOS7
/kind bug