Skip to content

Commit 55c073c

Browse files
Merge pull request #3133 from jacob-anders/fix-healthy-condition
🐛 Fix Healthy condition staying Unknown despite Ironic reporting health status
2 parents 5a65e4a + 065ea20 commit 55c073c

File tree

5 files changed

+66
-18
lines changed

5 files changed

+66
-18
lines changed

internal/controller/metal3.io/baremetalhost_controller.go

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -240,16 +240,21 @@ func (r *BareMetalHostReconciler) Reconcile(ctx context.Context, request ctrl.Re
240240
return result, err
241241
}
242242

243+
// Always compute conditions since some (e.g. Healthy) can change
244+
// based on external state from Ironic regardless of state machine
245+
// transitions.
246+
conditionsBefore := slices.Clone(host.GetConditions())
247+
computeConditions(ctx, host, prov)
248+
conditionsChanged := !reflect.DeepEqual(conditionsBefore, host.GetConditions())
249+
243250
// Only save status when we're told to, otherwise we
244251
// introduce an infinite loop reconciling the same object over and
245252
// over when there is an unrecoverable error (tracked through the
246253
// error state of the host).
247-
if actResult.Dirty() {
248-
// Save Host
254+
if actResult.Dirty() || conditionsChanged {
249255
info.log.Info("saving host status",
250256
"operational status", host.OperationalStatus(),
251257
"provisioning state", host.Status.Provisioning.State)
252-
computeConditions(ctx, host, prov)
253258
err = r.saveHostStatus(ctx, host)
254259
if err != nil {
255260
return ctrl.Result{}, fmt.Errorf("failed to save host status after %q: %w", initialState, err)
@@ -2301,7 +2306,11 @@ func computeConditions(ctx context.Context, host *metal3api.BareMetalHost, prov
23012306
setConditionUnknown(host, metal3api.HealthyCondition, metal3api.UnknownHealthReason)
23022307
return
23032308
}
2304-
switch prov.GetHealth(ctx) {
2309+
switch health := prov.GetHealth(ctx); health {
2310+
case "":
2311+
if meta.FindStatusCondition(host.Status.Conditions, string(metal3api.HealthyCondition)) == nil {
2312+
setConditionUnknown(host, metal3api.HealthyCondition, metal3api.UnknownHealthReason)
2313+
}
23052314
case provisioner.HealthOK:
23062315
setConditionTrue(host, metal3api.HealthyCondition, metal3api.HealthyReason)
23072316
case provisioner.HealthWarning:

internal/controller/metal3.io/baremetalhost_controller_test.go

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3187,12 +3187,6 @@ func TestComputeHealthyCondition(t *testing.T) {
31873187
ExpectedStatus metav1.ConditionStatus
31883188
ExpectedReason string
31893189
}{
3190-
{
3191-
Scenario: "empty health reports unknown",
3192-
Health: "",
3193-
ExpectedStatus: metav1.ConditionUnknown,
3194-
ExpectedReason: metal3api.UnknownHealthReason,
3195-
},
31963190
{
31973191
Scenario: "OK health",
31983192
Health: provisioner.HealthOK,
@@ -3234,6 +3228,27 @@ func TestComputeHealthyCondition(t *testing.T) {
32343228
}
32353229
}
32363230

3231+
func TestComputeHealthyConditionEmptyHealth(t *testing.T) {
3232+
host := bmhWithStatus(metal3api.OperationalStatusOK, metal3api.StateAvailable)
3233+
fix := &fixture.Fixture{Health: ""}
3234+
prov, err := fix.NewProvisioner(t.Context(), provisioner.BuildHostData(*host, bmc.Credentials{}), nil)
3235+
require.NoError(t, err)
3236+
3237+
// On a fresh host with no existing condition, empty health should
3238+
// initialise the condition to Unknown.
3239+
computeConditions(t.Context(), host, prov)
3240+
cond := conditions.Get(host, metal3api.HealthyCondition)
3241+
require.NotNil(t, cond, "empty health on new host should set Unknown condition")
3242+
assert.Equal(t, metav1.ConditionUnknown, cond.Status)
3243+
3244+
// Set a real health value, then verify empty health preserves it.
3245+
setConditionTrue(host, metal3api.HealthyCondition, metal3api.HealthyReason)
3246+
computeConditions(t.Context(), host, prov)
3247+
cond = conditions.Get(host, metal3api.HealthyCondition)
3248+
require.NotNil(t, cond)
3249+
assert.Equal(t, metav1.ConditionTrue, cond.Status, "empty health should not overwrite existing condition")
3250+
}
3251+
32373252
func TestComputeConditions(t *testing.T) {
32383253
fix := fixture.Fixture{PowerFailed: true}
32393254
provisionerWithPowerFailure, err := fix.NewProvisioner(t.Context(), provisioner.HostData{}, nil)
@@ -3332,7 +3347,7 @@ func TestComputeConditions(t *testing.T) {
33323347
}
33333348
cond := conditions.Get(tc.BareMetalHost, metal3api.HealthyCondition)
33343349
require.NotNil(t, cond, "Healthy condition should always be set")
3335-
assert.Equal(t, metav1.ConditionUnknown, cond.Status, "Healthy should be Unknown when health is not reported")
3350+
assert.Equal(t, metav1.ConditionUnknown, cond.Status, "Healthy should be Unknown when no health data is available")
33363351
})
33373352
}
33383353
}

pkg/provisioner/ironic/clients/features.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@ func (af AvailableFeatures) Log(logger logr.Logger) {
4545
"maxVersion", fmt.Sprintf("1.%d", af.MaxVersion),
4646
"chosenVersion", af.ChooseMicroversion(),
4747
"virtualMediaGET", af.HasVirtualMediaGetAPI(),
48-
"disablePowerOff", af.HasDisablePowerOff())
48+
"disablePowerOff", af.HasDisablePowerOff(),
49+
"healthAPI", af.HasHealthAPI())
4950
}
5051

5152
func (af AvailableFeatures) HasVirtualMediaGetAPI() bool {
@@ -56,7 +57,15 @@ func (af AvailableFeatures) HasDisablePowerOff() bool {
5657
return af.MaxVersion >= 95 //nolint:mnd
5758
}
5859

60+
func (af AvailableFeatures) HasHealthAPI() bool {
61+
return af.MaxVersion >= 109 //nolint:mnd
62+
}
63+
5964
func (af AvailableFeatures) ChooseMicroversion() string {
65+
if af.HasHealthAPI() {
66+
return "1.109"
67+
}
68+
6069
if af.HasDisablePowerOff() {
6170
return "1.95"
6271
}

pkg/provisioner/ironic/clients/features_test.go

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ import (
66
)
77

88
func TestAvailableFeatures_ChooseMicroversion(t *testing.T) {
9-
microVersion := "1.95"
109
type fields struct {
1110
MaxVersion int
1211
}
@@ -23,18 +22,32 @@ func TestAvailableFeatures_ChooseMicroversion(t *testing.T) {
2322
want: baselineVersionString,
2423
},
2524
{
26-
name: fmt.Sprintf("MaxVersion = %d return %s", 89, microVersion),
25+
name: "MaxVersion = 95 return 1.95",
2726
feature: fields{
2827
MaxVersion: 95,
2928
},
30-
want: microVersion,
29+
want: "1.95",
3130
},
3231
{
33-
name: fmt.Sprintf("MaxVersion > %d return %s", 89, microVersion),
32+
name: "MaxVersion = 100 return 1.95",
3433
feature: fields{
3534
MaxVersion: 100,
3635
},
37-
want: microVersion,
36+
want: "1.95",
37+
},
38+
{
39+
name: "MaxVersion = 109 return 1.109",
40+
feature: fields{
41+
MaxVersion: 109,
42+
},
43+
want: "1.109",
44+
},
45+
{
46+
name: "MaxVersion > 109 return 1.109",
47+
feature: fields{
48+
MaxVersion: 115,
49+
},
50+
want: "1.109",
3851
},
3952
}
4053
for _, tt := range tests {

pkg/provisioner/ironic/ironic.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1997,7 +1997,9 @@ func (p *ironicProvisioner) HasPowerFailure(ctx context.Context) bool {
19971997
func (p *ironicProvisioner) GetHealth(ctx context.Context) string {
19981998
node, err := p.getNode(ctx)
19991999
if err != nil {
2000-
p.log.Error(err, "ignored error while checking health status")
2000+
if !errors.Is(err, provisioner.ErrNeedsRegistration) {
2001+
p.log.Error(err, "ignored error while checking health status")
2002+
}
20012003
return ""
20022004
}
20032005
return node.Health

0 commit comments

Comments
 (0)