@@ -140,16 +140,61 @@ func riversOps(c *cke.Cluster, nf *NodeFilter, maxConcurrentUpdates int) (ops []
140140 return ops
141141}
142142
143- func k8sOps (c * cke.Cluster , nf * NodeFilter , cs * cke.ClusterStatus , maxConcurrentUpdates int ) (ops []cke.Operator ) {
144- // For cp nodes
145- if nodes := nf .SSHConnected (nf .APIServerStopped (nf .ControlPlaneNodes ())); len (nodes ) > 0 {
143+ func apiserverOps (c * cke.Cluster , nf * NodeFilter , cs * cke.ClusterStatus , maxConcurrentUpdates int ) (ops []cke.Operator , skipOtherOps bool ) {
144+ // First, do the following operations together.
145+ // - Starting stopped kube-apiservers. (for bootstrapping the Kubernetes cluster or for rebooting control plane nodes)
146+ // - Updating outdated and unhealthy apiservers. (for repairing configuration errors)
147+ var nodes []* cke.Node
148+ nodes = append (nodes , nf .SSHConnected (nf .APIServerStopped (nf .ControlPlaneNodes ()))... )
149+ nodes = append (nodes , nf .SSHConnected (nf .APIServerOutdated (nf .APIServerUnhealthy (nf .ControlPlaneNodes ())))... )
150+ if len (nodes ) > 0 {
151+ // Set the unhealthy kube-apiservers to NotReady.
152+ // NOTE: This step should be skipped during bootstrapping.
153+ if nf .HealthyAPIServer () != nil {
154+ ops = append (ops , masterEndpointOps (c , cs , nf , nil )... )
155+ }
146156 kubeletConfig := k8s .GenerateKubeletConfiguration (c .Options .Kubelet , "0.0.0.0" , nil )
147157 ops = append (ops , k8s .APIServerRestartOp (nodes , c .ServiceSubnet , c .Options .APIServer , kubeletConfig .ClusterDomain ))
148158 }
159+ if len (ops ) > 0 {
160+ return ops , true
161+ }
162+
163+ // Waiting for all kube-apiservers to be healthy. Because...
164+ // - Updating kube-apiservers should be performed only when all kube-apiservers are healthy.
165+ // - Other k8s components should be maintained only when all kube-apiservers are *updated* and healthy, to ensure the upgrade order.
166+ // See the version skew policy: https://kubernetes.io/releases/version-skew-policy/
167+ if len (nf .APIServerUnhealthy (nf .ControlPlaneNodes ())) > 0 {
168+ // Set the unhealthy kube-apiservers to NotReady.
169+ if nf .HealthyAPIServer () != nil {
170+ ops = append (ops , masterEndpointOps (c , cs , nf , nil )... )
171+ }
172+ return ops , true
173+ }
174+
175+ // Updating kube-apiservers one by one.
149176 if nodes := nf .SSHConnected (nf .APIServerOutdated (nf .ControlPlaneNodes ())); len (nodes ) > 0 {
177+ target := nodes [0 ] // just one
178+ ops = append (ops , masterEndpointOps (c , cs , nf , []string {target .Address })... )
150179 kubeletConfig := k8s .GenerateKubeletConfiguration (c .Options .Kubelet , "0.0.0.0" , nil )
151- ops = append (ops , k8s .APIServerRestartOp (nodes , c .ServiceSubnet , c .Options .APIServer , kubeletConfig .ClusterDomain ))
180+ ops = append (ops , k8s .APIServerRestartOp ([]* cke.Node {target }, c .ServiceSubnet , c .Options .APIServer , kubeletConfig .ClusterDomain ))
181+ return ops , true
152182 }
183+
184+ // Set kube-apiservers to Ready.
185+ // If a control plane node is about to reboot, set it to NotReady.
186+ ops = append (ops , masterEndpointOps (c , cs , nf , rebootNextCandidates (cs ))... )
187+ return ops , false
188+ }
189+
190+ func k8sOps (c * cke.Cluster , nf * NodeFilter , cs * cke.ClusterStatus , maxConcurrentUpdates int ) (ops []cke.Operator ) {
191+ apiserverOps , skipOtherOps := apiserverOps (c , nf , cs , maxConcurrentUpdates )
192+ if skipOtherOps {
193+ return apiserverOps
194+ }
195+ ops = append (ops , apiserverOps ... )
196+
197+ // Other CP components
153198 if nodes := nf .SSHConnected (nf .ControllerManagerStopped (nf .ControlPlaneNodes ())); len (nodes ) > 0 {
154199 ops = append (ops , k8s .ControllerManagerBootOp (nodes , c .Name , c .ServiceSubnet , c .Options .ControllerManager ))
155200 }
@@ -266,9 +311,7 @@ func k8sMaintOps(c *cke.Cluster, cs *cke.ClusterStatus, resources []cke.Resource
266311
267312 ops = append (ops , decideNodeDNSOps (apiServer , c , ks )... )
268313
269- ops = append (ops , masterEndpointOps (c , cs , nf )... )
270-
271- ops = append (ops , etcdEndpointOps (c , cs , nf )... )
314+ ops = append (ops , etcdEndpointOps (c , cs , nf , rebootNextCandidates (cs ))... )
272315
273316 if nodes := nf .OutdatedAttrsNodes (); len (nodes ) > 0 {
274317 ops = append (ops , op .KubeNodeUpdateOp (apiServer , nodes ))
@@ -347,17 +390,17 @@ type endpointParams struct {
347390 serviceName string
348391}
349392
350- func masterEndpointOps (c * cke.Cluster , cs * cke.ClusterStatus , nf * NodeFilter ) []cke.Operator {
393+ func masterEndpointOps (c * cke.Cluster , cs * cke.ClusterStatus , nf * NodeFilter , markedAsNotReadyIPs [] string ) []cke.Operator {
351394 var readyIPs , notReadyIPs []string
352395
353- for _ , n := range nf .HealthyAPIServerNodes ( ) {
354- if cs . RebootQueue . Enabled && ( rebootProcessing (cs , n .Address ) || rebootNextCandidate ( cs , n .Address ) ) {
396+ for _ , n := range nf .APIServerHealthy ( nf . ControlPlaneNodes () ) {
397+ if rebootProcessing (cs , n .Address ) || slices . Contains ( markedAsNotReadyIPs , n .Address ) {
355398 notReadyIPs = append (notReadyIPs , n .Address )
356399 } else {
357400 readyIPs = append (readyIPs , n .Address )
358401 }
359402 }
360- for _ , n := range nf .UnhealthyAPIServerNodes ( ) {
403+ for _ , n := range nf .APIServerUnhealthy ( nf . ControlPlaneNodes () ) {
361404 notReadyIPs = append (notReadyIPs , n .Address )
362405 }
363406
@@ -373,7 +416,7 @@ func masterEndpointOps(c *cke.Cluster, cs *cke.ClusterStatus, nf *NodeFilter) []
373416 return decideEpEpsOps (ep , cs .Kubernetes .MasterEndpoints , cs .Kubernetes .MasterEndpointSlice , nf .HealthyAPIServer ())
374417}
375418
376- func etcdEndpointOps (c * cke.Cluster , cs * cke.ClusterStatus , nf * NodeFilter ) (ops []cke.Operator ) {
419+ func etcdEndpointOps (c * cke.Cluster , cs * cke.ClusterStatus , nf * NodeFilter , markedAsNotReadyIPs [] string ) (ops []cke.Operator ) {
377420 // Endpoints needs a corresponding Service.
378421 // If an Endpoints lacks such a Service, it will be removed.
379422 // https://github.com/kubernetes/kubernetes/blob/b7c2d923ef4e166b9572d3aa09ca72231b59b28b/pkg/controller/endpoint/endpoints_controller.go#L392-L397
@@ -384,7 +427,7 @@ func etcdEndpointOps(c *cke.Cluster, cs *cke.ClusterStatus, nf *NodeFilter) (ops
384427
385428 var readyIPs , notReadyIPs []string
386429 for _ , n := range nf .ControlPlaneNodes () {
387- if cs . RebootQueue . Enabled && ( rebootProcessing (cs , n .Address ) || rebootNextCandidate ( cs , n .Address ) ) {
430+ if rebootProcessing (cs , n .Address ) || slices . Contains ( markedAsNotReadyIPs , n .Address ) {
388431 notReadyIPs = append (notReadyIPs , n .Address )
389432 } else {
390433 readyIPs = append (readyIPs , n .Address )
@@ -759,7 +802,7 @@ func repairOps(c *cke.Cluster, cs *cke.ClusterStatus, constraints *cke.Constrain
759802
760803 rebootingApiServers := make (map [string ]bool )
761804 for _ , cp := range nf .ControlPlaneNodes () {
762- if cs . RebootQueue . Enabled && rebootProcessing (cs , cp .Nodename ()) {
805+ if rebootProcessing (cs , cp .Nodename ()) {
763806 rebootingApiServers [cp .Address ] = true
764807 }
765808 }
@@ -911,7 +954,7 @@ func rebootUncordonOp(cs *cke.ClusterStatus, nf *NodeFilter) cke.Operator {
911954 }
912955 nodes := make ([]string , 0 , len (attrNodes ))
913956 for _ , n := range attrNodes {
914- if ( cs . RebootQueue . Enabled && rebootProcessing (cs , n .Name ) ) || repairProcessing (cs , n .Name ) {
957+ if rebootProcessing (cs , n .Name ) || repairProcessing (cs , n .Name ) {
915958 continue
916959 }
917960 nodes = append (nodes , n .Name )
@@ -922,8 +965,23 @@ func rebootUncordonOp(cs *cke.ClusterStatus, nf *NodeFilter) cke.Operator {
922965 return op .RebootUncordonOp (nf .HealthyAPIServer (), nodes )
923966}
924967
925- // NOTE: This function does not check whether the reboot queue is enabled or not.
968+ func rebootNextCandidates (cs * cke.ClusterStatus ) []string {
969+ if ! cs .RebootQueue .Enabled {
970+ return nil
971+ }
972+
973+ ret := []string {}
974+ for _ , entry := range cs .RebootQueue .NextCandidates {
975+ ret = append (ret , entry .Node )
976+ }
977+ return ret
978+ }
979+
926980func rebootProcessing (cs * cke.ClusterStatus , node string ) bool {
981+ if ! cs .RebootQueue .Enabled {
982+ return false
983+ }
984+
927985 for _ , entry := range cs .RebootQueue .Entries {
928986 if entry .Node != node {
929987 continue
@@ -938,16 +996,6 @@ func rebootProcessing(cs *cke.ClusterStatus, node string) bool {
938996 return false
939997}
940998
941- // NOTE: This function does not check whether the reboot queue is enabled or not.
942- func rebootNextCandidate (cs * cke.ClusterStatus , node string ) bool {
943- for _ , entry := range cs .RebootQueue .NextCandidates {
944- if entry .Node == node {
945- return true
946- }
947- }
948- return false
949- }
950-
951999func repairProcessing (cs * cke.ClusterStatus , nodename string ) bool {
9521000 for _ , entry := range cs .RepairQueue .Entries {
9531001 if entry .IsInCluster () && entry .Nodename == nodename &&
0 commit comments