Skip to content

Commit f86723e

Browse files
Merge pull request #5789 from djoshy/delay-boot-image-install
OCPBUGS-79088: OCPBUGS-79357: Skip boot image updates until cluster is stable
2 parents f9aca3a + d917592 commit f86723e

File tree

3 files changed

+184
-1
lines changed

3 files changed

+184
-1
lines changed

pkg/controller/bootimage/boot_image_controller.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"strings"
88
"time"
99

10+
osconfigv1 "github.com/openshift/api/config/v1"
1011
features "github.com/openshift/api/features"
1112
opv1 "github.com/openshift/api/operator/v1"
1213
configinformersv1 "github.com/openshift/client-go/config/informers/externalversions/config/v1"
@@ -204,6 +205,10 @@ func New(
204205
DeleteFunc: ctrl.deleteMachineConfiguration,
205206
})
206207

208+
clusterVersionInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
209+
UpdateFunc: ctrl.updateClusterVersion,
210+
})
211+
207212
ctrl.fgHandler = fgHandler
208213

209214
ctrl.mapiBootImageState = map[string]BootImageState{}
@@ -498,6 +503,26 @@ func (ctrl *Controller) deleteMachineConfiguration(obj interface{}) {
498503
ctrl.enqueueEvent("BootImageUpdateConfigurationDeleted")
499504
}
500505

506+
// updateClusterVersion handles updates to the ClusterVersion object. It triggers
507+
// a reconciliation only when the cluster transitions to a stable state (i.e. the
508+
// most recent history entry flips from partial to completed), which signals that
509+
// an install or upgrade just finished.
510+
func (ctrl *Controller) updateClusterVersion(oldCV, newCV interface{}) {
511+
oldClusterVersion := oldCV.(*osconfigv1.ClusterVersion)
512+
newClusterVersion := newCV.(*osconfigv1.ClusterVersion)
513+
514+
oldHistory := oldClusterVersion.Status.History
515+
newHistory := newClusterVersion.Status.History
516+
517+
oldStable := len(oldHistory) > 0 && oldHistory[0].State == osconfigv1.CompletedUpdate
518+
newStable := len(newHistory) > 0 && newHistory[0].State == osconfigv1.CompletedUpdate
519+
520+
if !oldStable && newStable {
521+
klog.Infof("Cluster reached stable state (version %s), triggering boot image reconciliation", newHistory[0].Version)
522+
ctrl.enqueueEvent("ClusterVersionStable")
523+
}
524+
}
525+
501526
// updateConditions updates the boot image update conditions on the MachineConfiguration status
502527
// based on the current state of machine resource reconciliation.
503528
func (ctrl *Controller) updateConditions(newReason string, syncError error, targetConditionType string) {
@@ -663,6 +688,19 @@ func (ctrl *Controller) syncAll(event string) error {
663688
return err
664689
}
665690

691+
// Skip reconciliation while the cluster is installing or upgrading.
692+
// External services may not yet be reachable during these transitions
693+
// (e.g. vCenter on vSphere), and boot image updates are only meaningful
694+
// once the cluster has reached a stable state.
695+
stable, err := ctrl.isClusterStable()
696+
if err != nil {
697+
return fmt.Errorf("failed to check cluster stability: %w", err)
698+
}
699+
if !stable {
700+
klog.Infof("Cluster install or upgrade in progress, deferring boot image reconciliation")
701+
return nil
702+
}
703+
666704
ctrl.syncControlPlaneMachineSets(event)
667705
ctrl.syncMAPIMachineSets(event)
668706
return nil

pkg/controller/bootimage/boot_image_controller_test.go

Lines changed: 130 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,143 @@ import (
77
"github.com/coreos/stream-metadata-go/stream/rhcos"
88
osconfigv1 "github.com/openshift/api/config/v1"
99
machinev1beta1 "github.com/openshift/api/machine/v1beta1"
10+
configlistersv1 "github.com/openshift/client-go/config/listers/config/v1"
1011
"github.com/stretchr/testify/assert"
1112
"github.com/stretchr/testify/require"
1213
corev1 "k8s.io/api/core/v1"
1314
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1415
"k8s.io/apimachinery/pkg/runtime"
1516
"k8s.io/client-go/kubernetes/fake"
17+
"k8s.io/client-go/tools/cache"
18+
"k8s.io/client-go/util/workqueue"
1619
)
1720

21+
func TestIsClusterStable(t *testing.T) {
22+
cases := []struct {
23+
name string
24+
history []osconfigv1.UpdateHistory
25+
expectStable bool
26+
}{
27+
{
28+
name: "empty history - install in progress",
29+
history: []osconfigv1.UpdateHistory{},
30+
expectStable: false,
31+
},
32+
{
33+
name: "only partial entry - install in progress",
34+
history: []osconfigv1.UpdateHistory{
35+
{State: osconfigv1.PartialUpdate, Version: "4.18.0"},
36+
},
37+
expectStable: false,
38+
},
39+
{
40+
name: "completed entry - stable after initial install",
41+
history: []osconfigv1.UpdateHistory{
42+
{State: osconfigv1.CompletedUpdate, Version: "4.18.0"},
43+
},
44+
expectStable: true,
45+
},
46+
{
47+
name: "upgrade in progress",
48+
history: []osconfigv1.UpdateHistory{
49+
{State: osconfigv1.PartialUpdate, Version: "4.19.0"},
50+
{State: osconfigv1.CompletedUpdate, Version: "4.18.0"},
51+
},
52+
expectStable: false,
53+
},
54+
{
55+
name: "upgrade complete - stable",
56+
history: []osconfigv1.UpdateHistory{
57+
{State: osconfigv1.CompletedUpdate, Version: "4.19.0"},
58+
{State: osconfigv1.CompletedUpdate, Version: "4.18.0"},
59+
},
60+
expectStable: true,
61+
},
62+
}
63+
64+
for _, tc := range cases {
65+
t.Run(tc.name, func(t *testing.T) {
66+
cv := &osconfigv1.ClusterVersion{
67+
ObjectMeta: v1.ObjectMeta{Name: "version"},
68+
Status: osconfigv1.ClusterVersionStatus{History: tc.history},
69+
}
70+
indexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{})
71+
require.NoError(t, indexer.Add(cv))
72+
ctrl := &Controller{
73+
clusterVersionLister: configlistersv1.NewClusterVersionLister(indexer),
74+
}
75+
stable, err := ctrl.isClusterStable()
76+
require.NoError(t, err)
77+
assert.Equal(t, tc.expectStable, stable)
78+
})
79+
}
80+
}
81+
82+
func TestUpdateClusterVersion(t *testing.T) {
83+
cases := []struct {
84+
name string
85+
oldHistory []osconfigv1.UpdateHistory
86+
newHistory []osconfigv1.UpdateHistory
87+
expectEnqueue bool
88+
}{
89+
{
90+
name: "partial to completed - install finishes",
91+
oldHistory: []osconfigv1.UpdateHistory{{State: osconfigv1.PartialUpdate, Version: "4.18.0"}},
92+
newHistory: []osconfigv1.UpdateHistory{{State: osconfigv1.CompletedUpdate, Version: "4.18.0"}},
93+
expectEnqueue: true,
94+
},
95+
{
96+
// not sure this is possible, added for safety
97+
name: "empty to completed - install finishes from scratch",
98+
oldHistory: []osconfigv1.UpdateHistory{},
99+
newHistory: []osconfigv1.UpdateHistory{{State: osconfigv1.CompletedUpdate, Version: "4.18.0"}},
100+
expectEnqueue: true,
101+
},
102+
{
103+
name: "upgrade partial to completed - upgrade finishes",
104+
oldHistory: []osconfigv1.UpdateHistory{
105+
{State: osconfigv1.PartialUpdate, Version: "4.19.0"},
106+
{State: osconfigv1.CompletedUpdate, Version: "4.18.0"},
107+
},
108+
newHistory: []osconfigv1.UpdateHistory{
109+
{State: osconfigv1.CompletedUpdate, Version: "4.19.0"},
110+
{State: osconfigv1.CompletedUpdate, Version: "4.18.0"},
111+
},
112+
expectEnqueue: true,
113+
},
114+
{
115+
name: "completed to completed - no change in stability",
116+
oldHistory: []osconfigv1.UpdateHistory{{State: osconfigv1.CompletedUpdate, Version: "4.18.0"}},
117+
newHistory: []osconfigv1.UpdateHistory{{State: osconfigv1.CompletedUpdate, Version: "4.18.0"}},
118+
expectEnqueue: false,
119+
},
120+
{
121+
name: "partial to partial - still in progress",
122+
oldHistory: []osconfigv1.UpdateHistory{{State: osconfigv1.PartialUpdate, Version: "4.18.0"}},
123+
newHistory: []osconfigv1.UpdateHistory{{State: osconfigv1.PartialUpdate, Version: "4.18.0"}},
124+
expectEnqueue: false,
125+
},
126+
{
127+
name: "completed to partial - new upgrade started",
128+
oldHistory: []osconfigv1.UpdateHistory{{State: osconfigv1.CompletedUpdate, Version: "4.18.0"}},
129+
newHistory: []osconfigv1.UpdateHistory{{State: osconfigv1.PartialUpdate, Version: "4.19.0"}, {State: osconfigv1.CompletedUpdate, Version: "4.18.0"}},
130+
expectEnqueue: false,
131+
},
132+
}
133+
134+
for _, tc := range cases {
135+
t.Run(tc.name, func(t *testing.T) {
136+
ctrl := &Controller{
137+
queue: workqueue.NewTypedRateLimitingQueue(workqueue.DefaultTypedControllerRateLimiter[string]()),
138+
}
139+
oldCV := &osconfigv1.ClusterVersion{Status: osconfigv1.ClusterVersionStatus{History: tc.oldHistory}}
140+
newCV := &osconfigv1.ClusterVersion{Status: osconfigv1.ClusterVersionStatus{History: tc.newHistory}}
141+
ctrl.updateClusterVersion(oldCV, newCV)
142+
assert.Equal(t, tc.expectEnqueue, ctrl.queue.Len() > 0)
143+
})
144+
}
145+
}
146+
18147
func TestHotLoop(t *testing.T) {
19148
cases := []struct {
20149
name string
@@ -411,7 +540,7 @@ func TestReconcileAzureProviderSpec(t *testing.T) {
411540
expectedImage machinev1beta1.Image
412541
expectPatch bool
413542
expectSkip bool
414-
streamData *stream.Stream // Custom stream data for specific tests
543+
streamData *stream.Stream // Custom stream data for specific tests
415544
securityProfile *machinev1beta1.SecurityProfile // Custom security profile for specific tests
416545
}{
417546
{

pkg/controller/bootimage/helpers.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"strings"
88
"time"
99

10+
osconfigv1 "github.com/openshift/api/config/v1"
1011
machinev1beta1 "github.com/openshift/api/machine/v1beta1"
1112
opv1 "github.com/openshift/api/operator/v1"
1213

@@ -131,6 +132,21 @@ func upgradeStubIgnitionIfRequired(secretName string, secretClient clientset.Int
131132
return nil
132133
}
133134

135+
// isClusterStable returns true if the cluster is in a stable state, meaning
136+
// the most recent ClusterVersion history entry is a completed update. This
137+
// returns false during the initial installation (no completed entry yet) and
138+
// during upgrades (most recent entry is partial).
139+
func (ctrl *Controller) isClusterStable() (bool, error) {
140+
clusterVersion, err := ctrl.clusterVersionLister.Get("version")
141+
if err != nil {
142+
return false, fmt.Errorf("failed to fetch clusterversion: %w", err)
143+
}
144+
if len(clusterVersion.Status.History) == 0 {
145+
return false, nil
146+
}
147+
return clusterVersion.Status.History[0].State == osconfigv1.CompletedUpdate, nil
148+
}
149+
134150
// waitForMachineConfigurationReady waits for the MachineConfiguration to be ready
135151
// by polling until the status is populated and the ObservedGeneration matches Generation.
136152
func (ctrl *Controller) waitForMachineConfigurationReady() error {

0 commit comments

Comments
 (0)