Commit 79c8f47

feat: execute k8s and os plans sequentially node by node
Assemble two plans (k8s and OS) so that they execute sequentially, in a node-by-node fashion, by manipulating node labels.

Signed-off-by: Zespre Chang <[email protected]>
1 parent: 248fb17

File tree: 10 files changed (+409, -103 lines)

api/v1beta1/upgradeplan_types.go

Lines changed: 4 additions & 0 deletions

@@ -115,6 +115,10 @@ type UpgradePlanSpec struct {
     // +kubebuilder:default:=automatic
     // +kubebuilder:validation:Enum:=automatic;interactive
     Mode *string `json:"mode,omitempty"`
+
+    // skipOSUpgrade indicates whether to skip the operating system upgrade. Default to "false" (OS will be upgraded).
+    // +optional
+    SkipOSUpgrade *bool `json:"skipOSUpgrade,omitempty"`
 }

 // UpgradePlanStatus defines the observed state of UpgradePlan.
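
Because the new field is a *bool, a nil pointer must read as the default (OS upgrade enabled). A minimal, self-contained sketch of that three-valued check: UpgradePlanSpec is reduced to the one field that matters here, and shouldUpgradeOS is a hypothetical helper, not part of the commit.

package main

import "fmt"

// Reduced stand-in for the real UpgradePlanSpec.
type UpgradePlanSpec struct {
    SkipOSUpgrade *bool `json:"skipOSUpgrade,omitempty"`
}

// shouldUpgradeOS is hypothetical: nil means skipOSUpgrade was omitted,
// which defaults to "false" (the OS upgrade runs).
func shouldUpgradeOS(spec UpgradePlanSpec) bool {
    return spec.SkipOSUpgrade == nil || !*spec.SkipOSUpgrade
}

func main() {
    skip := true
    fmt.Println(shouldUpgradeOS(UpgradePlanSpec{}))                     // true: field omitted
    fmt.Println(shouldUpgradeOS(UpgradePlanSpec{SkipOSUpgrade: &skip})) // false: explicitly skipped
}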

api/v1beta1/zz_generated.deepcopy.go

Lines changed: 5 additions & 0 deletions
Generated file; diff not rendered by default.

config/crd/bases/management.harvesterhci.io_upgradeplans.yaml

Lines changed: 4 additions & 0 deletions

@@ -70,6 +70,10 @@ spec:
                 - automatic
                 - interactive
                 type: string
+              skipOSUpgrade:
+                description: skipOSUpgrade indicates whether to skip the operating
+                  system upgrade. Default to "false" (OS will be upgraded).
+                type: boolean
               upgrade:
                 description: |-
                   upgrade can be specified to opt for any other specific upgrade image. If not provided, the version resource name is used.

config/rbac/role.yaml

Lines changed: 2 additions & 0 deletions

@@ -11,6 +11,8 @@ rules:
   verbs:
   - get
   - list
+  - patch
+  - update
   - watch
 - apiGroups:
   - ""

internal/controller/job_controller.go

Lines changed: 141 additions & 20 deletions

@@ -48,6 +48,7 @@ type reconcileFuncs func(context.Context, *batchv1.Job) error

 // +kubebuilder:rbac:groups=batch,resources=jobs,verbs=get;list;watch
 // +kubebuilder:rbac:groups=batch,resources=jobs/status,verbs=get
+// +kubebuilder:rbac:groups=core,resources=nodes,verbs=get;list;watch;update;patch
 // +kubebuilder:rbac:groups=management.harvesterhci.io,resources=upgradeplans/status,verbs=get;update;patch

 // Reconcile is part of the main kubernetes reconciliation loop which aims to
@@ -60,7 +61,7 @@ type reconcileFuncs func(context.Context, *batchv1.Job) error
 // For more details, check Reconcile and its Result here:
 // - https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/reconcile
 func (r *JobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
-    r.Log.V(1).Info("reconciling job")
+    r.Log.V(2).Info("reconciling job")

     var job batchv1.Job
     if err := r.Get(ctx, req.NamespacedName, &job); err != nil {
@@ -78,7 +79,10 @@ func (r *JobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
         return ctrl.Result{}, nil
     }

-    reconcilers := []reconcileFuncs{r.nodeUpgradeStatusUpdate}
+    reconcilers := []reconcileFuncs{
+        r.nodeUpgradeStatusUpdate,
+        r.nodeLabelUpdate,
+    }

     for _, reconciler := range reconcilers {
         if err := reconciler(ctx, jobCopy); err != nil {
@@ -138,6 +142,55 @@ func (r *JobReconciler) nodeUpgradeStatusUpdate(ctx context.Context, job *batchv
     return nil
 }

+func (r *JobReconciler) nodeLabelUpdate(ctx context.Context, job *batchv1.Job) error {
+    r.Log.V(1).Info("node label update")
+
+    upgradePlanName, ok := job.Labels[upgradeplan.HarvesterUpgradePlanLabel]
+    if !ok {
+        return fmt.Errorf("label %s not found", upgradeplan.HarvesterUpgradePlanLabel)
+    }
+    upgradeComponent, ok := job.Labels[upgradeplan.HarvesterUpgradeComponentLabel]
+    if !ok {
+        return fmt.Errorf("label %s not found", upgradeplan.HarvesterUpgradeComponentLabel)
+    }
+    nodeName, ok := job.Labels[nodeLabel]
+    if !ok {
+        return fmt.Errorf("label %s not found", nodeLabel)
+    }
+
+    if upgradeComponent != upgradeplan.NodeComponent {
+        return nil
+    }
+
+    finished, success := isJobFinished(job)
+
+    // Job still running or failed
+    if !finished || !success {
+        return nil
+    }
+
+    var node corev1.Node
+    if err := r.Get(ctx, types.NamespacedName{Name: nodeName}, &node); err != nil {
+        return err
+    }
+
+    var upgradePlan managementv1beta1.UpgradePlan
+    if err := r.Get(ctx, types.NamespacedName{Name: upgradePlanName}, &upgradePlan); err != nil {
+        return err
+    }
+
+    // Nudge the node to the next upgrade state once the Job has finished successfully
+    if err := updateNodeLabel(&node, &upgradePlan); err != nil {
+        return err
+    }
+
+    if err := r.Update(ctx, &node); err != nil {
+        return err
+    }
+
+    return nil
+}
+
 func isHarvesterUpgradePlanJobs(job *batchv1.Job) bool {
     if job.Labels == nil {
         return false
@@ -158,43 +211,47 @@ func isHarvesterUpgradePlanJobs(job *batchv1.Job) bool {
     return true
 }

-func defaultStateFor(component string) string {
-    switch component {
-    case upgradeplan.PrepareComponent:
+func defaultStateFor(component, t string) string {
+    switch {
+    case component == upgradeplan.PrepareComponent:
         return managementv1beta1.NodeStateImagePreloading
-    case upgradeplan.NodeComponent:
+    case component == upgradeplan.NodeComponent && t == upgradeplan.NodeUpgradeTypeKubernetes:
         return managementv1beta1.NodeStateKubernetesUpgrading
+    case component == upgradeplan.NodeComponent && t == upgradeplan.NodeUpgradeTypeOS:
+        return managementv1beta1.NodeStateOSUpgrading
     default:
         return ""
     }
 }

-func successStateFor(component string) string {
-    switch component {
-    case upgradeplan.PrepareComponent:
+func successStateFor(component, t string) string {
+    switch {
+    case component == upgradeplan.PrepareComponent:
         return managementv1beta1.NodeStateImagePreloaded
-    case upgradeplan.NodeComponent:
+    case component == upgradeplan.NodeComponent && t == upgradeplan.NodeUpgradeTypeKubernetes:
         return managementv1beta1.NodeStateKubernetesUpgraded
+    case component == upgradeplan.NodeComponent && t == upgradeplan.NodeUpgradeTypeOS:
+        return managementv1beta1.NodeStateOSUpgraded
     default:
         return ""
     }
 }

-func failureStateFor(component string) string {
-    switch component {
-    case upgradeplan.PrepareComponent:
+func failureStateFor(component, t string) string {
+    switch {
+    case component == upgradeplan.PrepareComponent:
         return managementv1beta1.NodeStateImagePreloadFailed
-    case upgradeplan.NodeComponent:
+    case component == upgradeplan.NodeComponent && t == upgradeplan.NodeUpgradeTypeKubernetes:
         return managementv1beta1.NodeStateKubernetesUpgradeFailed
+    case component == upgradeplan.NodeComponent && t == upgradeplan.NodeUpgradeTypeOS:
+        return managementv1beta1.NodeStateOSUpgradeFailed
     default:
         return ""
     }
 }

 func buildNodeUpgradeStatus(job *batchv1.Job, upgradeComponent string) managementv1beta1.NodeUpgradeStatus {
-    status := managementv1beta1.NodeUpgradeStatus{
-        State: defaultStateFor(upgradeComponent),
-    }
+    jobType := job.Labels[upgradeplan.HarvesterNodeUpgradeTypeLabel]

     for _, condition := range job.Status.Conditions {
         if condition.Status != corev1.ConditionTrue {
@@ -203,16 +260,80 @@ func buildNodeUpgradeStatus(job *batchv1.Job, upgradeComponent string) managemen
         switch condition.Type {
         case batchv1.JobComplete:
             return managementv1beta1.NodeUpgradeStatus{
-                State: successStateFor(upgradeComponent),
+                State: successStateFor(upgradeComponent, jobType),
             }
         case batchv1.JobFailed:
             return managementv1beta1.NodeUpgradeStatus{
-                State:   failureStateFor(upgradeComponent),
+                State:   failureStateFor(upgradeComponent, jobType),
                 Reason:  condition.Reason,
                 Message: condition.Message,
             }
         }
     }

-    return status
+    return managementv1beta1.NodeUpgradeStatus{
+        State: defaultStateFor(upgradeComponent, jobType),
+    }
+}
+
+func isJobFinished(job *batchv1.Job) (finished, success bool) {
+    for _, condition := range job.Status.Conditions {
+        if condition.Type == batchv1.JobComplete && condition.Status == corev1.ConditionTrue {
+            finished, success = true, true
+            return
+        }
+        if condition.Type == batchv1.JobFailed && condition.Status == corev1.ConditionTrue {
+            finished, success = true, false
+            return
+        }
+    }
+    return
+}
+
+func updateNodeLabel(node *corev1.Node, upgradePlan *managementv1beta1.UpgradePlan) error {
+    if node.Labels == nil {
+        node.Labels = make(map[string]string)
+    }
+
+    nodeUpgradeDesiredStateLabelKey := fmt.Sprintf("%s/%s", upgradeplan.LabelPrefix, upgradePlan.Name)
+    desiredState, ok := node.Labels[nodeUpgradeDesiredStateLabelKey]
+    if !ok {
+        return nil
+    }
+
+    switch desiredState {
+    case upgradeplan.KubernetesUpgradeState:
+        // If the desired upgrade state for the node is KubernetesUpgradeState and the node is already at
+        // NodeStateKubernetesUpgraded, nudge the node to enter OSUpgradeState.
+        currentStatus, ok := upgradePlan.Status.NodeUpgradeStatuses[node.Name]
+        if !ok {
+            return fmt.Errorf("node %s not found in the node upgrade statuses map", node.Name)
+        }
+        if currentStatus.State != managementv1beta1.NodeStateKubernetesUpgraded {
+            return nil
+        }
+
+        if upgradePlan.Spec.SkipOSUpgrade != nil && *upgradePlan.Spec.SkipOSUpgrade {
+            delete(node.Labels, nodeUpgradeDesiredStateLabelKey)
+            return nil
+        }
+
+        node.Labels[nodeUpgradeDesiredStateLabelKey] = upgradeplan.OSUpgradeState
+    case upgradeplan.OSUpgradeState:
+        // If the desired upgrade state for the node is OSUpgradeState and the node is already at
+        // NodeStateOSUpgraded, the node is considered fully upgraded.
+        currentStatus, ok := upgradePlan.Status.NodeUpgradeStatuses[node.Name]
+        if !ok {
+            return fmt.Errorf("node %s not found in the node upgrade statuses map", node.Name)
+        }
+        if currentStatus.State != managementv1beta1.NodeStateOSUpgraded {
+            return nil
+        }
+
+        delete(node.Labels, nodeUpgradeDesiredStateLabelKey)
+    default:
+        return fmt.Errorf("unrecognized %s label value: %s", nodeUpgradeDesiredStateLabelKey, desiredState)
+    }
+
+    return nil
 }
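
Taken together, nodeLabelUpdate and updateNodeLabel implement a small per-node state machine keyed off the <LabelPrefix>/<plan-name> node label: the Kubernetes plan's success hands the node off to the OS plan (unless skipOSUpgrade is set), and the OS plan's success removes the label entirely. A self-contained sketch of that transition logic; the string values below are stand-ins for the real upgradeplan.KubernetesUpgradeState, upgradeplan.OSUpgradeState, and node-state constants, which are defined elsewhere in the repo.

package main

import "fmt"

// Stand-in label values; the real constants live in the upgradeplan package.
const (
    kubernetesUpgradeState = "kubernetes-upgrade"
    osUpgradeState         = "os-upgrade"
)

// nextDesiredState mirrors updateNodeLabel: given the node's current desired
// state label and the state reported in the plan's status, it returns the next
// label value. remove=true means the label is deleted, i.e. the node is fully
// upgraded (or the OS upgrade was skipped).
func nextDesiredState(desired, nodeState string, skipOS bool) (next string, remove bool, err error) {
    switch desired {
    case kubernetesUpgradeState:
        if nodeState != "KubernetesUpgraded" { // stand-in for NodeStateKubernetesUpgraded
            return desired, false, nil // k8s upgrade not finished; leave the label alone
        }
        if skipOS {
            return "", true, nil // skip the OS plan entirely
        }
        return osUpgradeState, false, nil // hand the node over to the OS plan
    case osUpgradeState:
        if nodeState != "OSUpgraded" { // stand-in for NodeStateOSUpgraded
            return desired, false, nil
        }
        return "", true, nil // both plans done for this node
    default:
        return "", false, fmt.Errorf("unrecognized state %q", desired)
    }
}

func main() {
    next, _, _ := nextDesiredState(kubernetesUpgradeState, "KubernetesUpgraded", false)
    fmt.Println(next) // os-upgrade: the OS plan now picks this node up
}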

internal/controller/upgradeplan_controller.go

Lines changed: 1 addition & 1 deletion

@@ -68,7 +68,7 @@ type UpgradePlanReconciler struct {
 // For more details, check Reconcile and its Result here:
 // - https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/reconcile
 func (r *UpgradePlanReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
-    r.Log.V(1).Info("reconciling upgradeplan")
+    r.Log.V(2).Info("reconciling upgradeplan")

     var upgradePlan managementv1beta1.UpgradePlan
     if err := r.Get(ctx, req.NamespacedName, &upgradePlan); err != nil {
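
Both reconcilers' per-iteration log lines move from V(1) to V(2). In logr, a larger V level means less important, so this quiets the hot path one notch: a logger configured to show only V(1) and below no longer prints a line on every reconcile. A sketch, assuming the usual controller-runtime zap setup (zap levels are the negation of logr V levels):

package main

import (
    "go.uber.org/zap/zapcore"
    ctrl "sigs.k8s.io/controller-runtime"
    "sigs.k8s.io/controller-runtime/pkg/log/zap"
)

func main() {
    // Enable everything down to logr V(1) (== zap level -1). The V(2)
    // "reconciling job"/"reconciling upgradeplan" lines stay suppressed
    // unless the threshold is lowered to -2.
    ctrl.SetLogger(zap.New(zap.Level(zapcore.Level(-1))))
    ctrl.Log.V(1).Info("visible")
    ctrl.Log.V(2).Info("suppressed")
}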

package/upgrade_manifests.sh

Lines changed: 5 additions & 0 deletions

@@ -984,6 +984,10 @@ pause_all_charts() {
   done
 }

+disable_cluster_version_management() {
+  kubectl annotate clusters.management local rancher.io/imported-cluster-version-management="false" --overwrite=true
+}
+
 skip_restart_rancher_system_agent() {
   # to prevent rke2-server/agent from restarting during the rancher upgrade.
   # by adding an env var to temporarily make rancher-system-agent on each node skip restarting rke2-server/agent.
@@ -1243,6 +1247,7 @@ detect_repo
 detect_upgrade
 pre_upgrade_manifest
 pause_all_charts
+disable_cluster_version_management
 skip_restart_rancher_system_agent
 upgrade_rancher
 patch_local_cluster_details
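
disable_cluster_version_management annotates the local management cluster so that Rancher's imported-cluster version management does not fight the upgrade while the plans drive RKE2 node by node. For completeness, a Go sketch of the equivalent call using client-go's dynamic client; the GVR below (management.cattle.io/v3 clusters) is an assumption inferred from the clusters.management shorthand in the kubectl command.

package upgrade

import (
    "context"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/runtime/schema"
    "k8s.io/apimachinery/pkg/types"
    "k8s.io/client-go/dynamic"
    "k8s.io/client-go/rest"
)

// disableClusterVersionManagement merge-patches the same annotation the shell
// function sets with `kubectl annotate ... --overwrite=true`.
func disableClusterVersionManagement(ctx context.Context, cfg *rest.Config) error {
    client, err := dynamic.NewForConfig(cfg)
    if err != nil {
        return err
    }
    // Assumed GVR for Rancher's cluster-scoped clusters.management.cattle.io resource.
    gvr := schema.GroupVersionResource{Group: "management.cattle.io", Version: "v3", Resource: "clusters"}
    patch := []byte(`{"metadata":{"annotations":{"rancher.io/imported-cluster-version-management":"false"}}}`)
    _, err = client.Resource(gvr).Patch(ctx, "local", types.MergePatchType, patch, metav1.PatchOptions{})
    return err
}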
