Skip to content

Commit 728897a

Browse files
committed
fix: wait for infra machine info to be collected before powering off
There were cases when an infra machine was accepted, it was powered off by the infra provider too quickly, before its specs were populated by the `MachineStatus` poller. This caused additional issues as some other resources like `SchematicConfiguration` were also never created, blocking cluster creation. Address this by setting the preferred power state of the infra machine to ON until its status is populated (we check this by checking the secure boot status field). Additionally, clean up `InfraMachineConfig` resources (user-managed) when a machine is deleted. Signed-off-by: Utku Ozdemir <[email protected]>
1 parent 1c4f9af commit 728897a

File tree

3 files changed

+44
-6
lines changed

3 files changed

+44
-6
lines changed

internal/backend/runtime/omni/controllers/omni/infra_machine.go

+21-3
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,9 @@ func NewInfraMachineController() *InfraMachineController {
6464
qtransform.WithExtraMappedInput(
6565
qtransform.MapperSameID[*omni.ClusterMachine, *siderolink.Link](),
6666
),
67+
qtransform.WithExtraMappedInput(
68+
qtransform.MapperSameID[*omni.MachineStatus, *siderolink.Link](),
69+
),
6770
qtransform.WithExtraMappedInput(
6871
func(ctx context.Context, _ *zap.Logger, runtime controller.QRuntime, res *infra.ProviderStatus) ([]resource.Pointer, error) {
6972
linkList, err := safe.ReaderListAll[*siderolink.Link](ctx, runtime, state.WithLabelQuery(resource.LabelEqual(omni.LabelInfraProviderID, res.Metadata().ID())))
@@ -94,6 +97,11 @@ func (h *infraMachineControllerHelper) transformExtraOutput(ctx context.Context,
9497
return err
9598
}
9699

100+
machineStatus, err := helpers.HandleInput[*omni.MachineStatus](ctx, r, InfraMachineControllerName, link)
101+
if err != nil {
102+
return err
103+
}
104+
97105
providerID, ok := link.Metadata().Annotations().Get(omni.LabelInfraProviderID)
98106
if !ok {
99107
return xerrors.NewTaggedf[qtransform.SkipReconcileTag]("the link is not created by an infra provider")
@@ -108,7 +116,9 @@ func (h *infraMachineControllerHelper) transformExtraOutput(ctx context.Context,
108116
return xerrors.NewTaggedf[qtransform.SkipReconcileTag]("the link is not created by a static infra provider")
109117
}
110118

111-
if err = h.applyInfraMachineConfig(infraMachine, config); err != nil {
119+
machineInfoCollected := machineStatus != nil && machineStatus.TypedSpec().Value.SecureBootStatus != nil
120+
121+
if err = h.applyInfraMachineConfig(infraMachine, config, machineInfoCollected); err != nil {
112122
return err
113123
}
114124

@@ -168,13 +178,17 @@ func (h *infraMachineControllerHelper) finalizerRemovalExtraOutput(ctx context.C
168178
return err
169179
}
170180

171-
_, err := helpers.HandleInput[*omni.ClusterMachine](ctx, r, InfraMachineControllerName, link)
181+
if _, err := helpers.HandleInput[*omni.ClusterMachine](ctx, r, InfraMachineControllerName, link); err != nil {
182+
return err
183+
}
184+
185+
_, err := helpers.HandleInput[*omni.MachineStatus](ctx, r, InfraMachineControllerName, link)
172186

173187
return err
174188
}
175189

176190
// applyInfraMachineConfig applies the user-managed configuration from the omni.InfraMachineConfig resource into the infra.Machine.
177-
func (h *infraMachineControllerHelper) applyInfraMachineConfig(infraMachine *infra.Machine, config *omni.InfraMachineConfig) error {
191+
func (h *infraMachineControllerHelper) applyInfraMachineConfig(infraMachine *infra.Machine, config *omni.InfraMachineConfig, machineInfoCollected bool) error {
178192
const defaultPreferredPowerState = specs.InfraMachineSpec_POWER_STATE_OFF // todo: introduce a resource to configure this globally or per-provider level
179193

180194
// reset the user-override fields except the "Accepted" field
@@ -209,6 +223,10 @@ func (h *infraMachineControllerHelper) applyInfraMachineConfig(infraMachine *inf
209223
infraMachine.Metadata().Labels().Delete(omni.LabelMachinePendingAccept)
210224
}
211225

226+
if !machineInfoCollected { // we need the machine to stay powered on even if it is accepted, until Omni collects the machine information
227+
infraMachine.TypedSpec().Value.PreferredPowerState = specs.InfraMachineSpec_POWER_STATE_ON
228+
}
229+
212230
return nil
213231
}
214232

internal/backend/runtime/omni/controllers/omni/infra_machine_test.go

+18-1
Original file line numberDiff line numberDiff line change
@@ -47,13 +47,26 @@ func (suite *InfraMachineControllerSuite) TestReconcile() {
4747
assertion.True(ok)
4848
assertion.Equal("bare-metal", infraProviderID)
4949

50-
assertion.Equal(specs.InfraMachineSpec_POWER_STATE_OFF, r.TypedSpec().Value.PreferredPowerState)
50+
assertion.Equal(specs.InfraMachineSpec_POWER_STATE_ON, r.TypedSpec().Value.PreferredPowerState) // MachineStatus is not populated yet
5151
assertion.Equal(specs.InfraMachineConfigSpec_PENDING, r.TypedSpec().Value.AcceptanceStatus)
5252
assertion.Empty(r.TypedSpec().Value.ClusterTalosVersion)
5353
assertion.Empty(r.TypedSpec().Value.Extensions)
5454
assertion.Empty(r.TypedSpec().Value.WipeId)
5555
})
5656

57+
machineStatus := omni.NewMachineStatus(resources.DefaultNamespace, "machine-1")
58+
machineStatus.TypedSpec().Value.SecureBootStatus = &specs.SecureBootStatus{}
59+
60+
suite.Require().NoError(suite.state.Create(suite.ctx, machineStatus))
61+
62+
assertResource[*omni.MachineStatus](&suite.OmniSuite, machineStatus.Metadata(), func(r *omni.MachineStatus, assertion *assert.Assertions) {
63+
assertion.True(r.Metadata().Finalizers().Has(omnictrl.InfraMachineControllerName))
64+
})
65+
66+
assertResource[*infra.Machine](&suite.OmniSuite, infraMachineMD, func(r *infra.Machine, assertion *assert.Assertions) {
67+
assertion.Equal(specs.InfraMachineSpec_POWER_STATE_OFF, r.TypedSpec().Value.PreferredPowerState) // expect the default state of "OFF"
68+
})
69+
5770
// accept the machine, set its preferred power state to on
5871
config := omni.NewInfraMachineConfig(resources.DefaultNamespace, "machine-1")
5972
config.TypedSpec().Value.AcceptanceStatus = specs.InfraMachineConfigSpec_ACCEPTED
@@ -148,6 +161,10 @@ func (suite *InfraMachineControllerSuite) TestReconcile() {
148161
assertion.False(r.Metadata().Finalizers().Has(omnictrl.InfraMachineControllerName))
149162
})
150163

164+
assertResource[*omni.MachineStatus](&suite.OmniSuite, infraMachineMD, func(r *omni.MachineStatus, assertion *assert.Assertions) {
165+
assertion.False(r.Metadata().Finalizers().Has(omnictrl.InfraMachineControllerName))
166+
})
167+
151168
// assert that infra.Machine is removed
152169
assertNoResource[*infra.Machine](&suite.OmniSuite, infraMachine)
153170
}

internal/backend/runtime/omni/controllers/omni/machine_cleanup.go

+5-2
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,11 @@ type MachineCleanupController = cleanup.Controller[*omni.Machine]
2121
func NewMachineCleanupController() *MachineCleanupController {
2222
return cleanup.NewController(
2323
cleanup.Settings[*omni.Machine]{
24-
Name: "MachineCleanupController",
25-
Handler: &helpers.SameIDHandler[*omni.Machine, *omni.MachineSetNode]{},
24+
Name: "MachineCleanupController",
25+
Handler: cleanup.Combine(
26+
&helpers.SameIDHandler[*omni.Machine, *omni.MachineSetNode]{},
27+
&helpers.SameIDHandler[*omni.Machine, *omni.InfraMachineConfig]{},
28+
),
2629
},
2730
)
2831
}

0 commit comments

Comments
 (0)