Skip to content

Commit e79a4e7

Browse files
CrystalChun and dtantsur committed
Allow forced detachment of a host from Ironic
Adds a Force field to DetachedAnnotationArguments in the API. When force is set to true, the host is detached in any state by aborting the currently running process.

Co-Authored-By: Dmitry Tantsur <dtantsur@protonmail.com>
Signed-off-by: CrystalChun <cchun@redhat.com>
Signed-off-by: Dmitry Tantsur <dtantsur@protonmail.com>
1 parent c70ba87 commit e79a4e7

10 files changed

Lines changed: 120 additions & 36 deletions

File tree

apis/metal3.io/v1alpha1/baremetalhost_types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -685,6 +685,11 @@ const (
685685
type DetachedAnnotationArguments struct {
686686
// DeleteAction indicates the desired delete logic when the detached annotation is present
687687
DeleteAction DetachedDeleteAction `json:"deleteAction,omitempty"`
688+
689+
// Force indicates if detaching should be forced regardless of the host's state
690+
// +optional
691+
// +kubebuilder:default:=false
692+
Force bool `json:"force,omitempty"`
688693
}
689694

690695
// Match compares the saved status information with the name and

internal/controller/metal3.io/baremetalhost_controller.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -617,8 +617,8 @@ func hasCustomDeploy(host *metal3api.BareMetalHost) bool {
617617
}
618618

619619
// detachHost() detaches the host from the Provisioner.
620-
func (r *BareMetalHostReconciler) detachHost(ctx context.Context, prov provisioner.Provisioner, info *reconcileInfo) actionResult {
621-
provResult, err := prov.Detach(ctx)
620+
func (r *BareMetalHostReconciler) detachHost(ctx context.Context, prov provisioner.Provisioner, info *reconcileInfo, force bool) actionResult {
621+
provResult, err := prov.Detach(ctx, force)
622622
if err != nil {
623623
return actionError{fmt.Errorf("failed to detach: %w", err)}
624624
}

internal/controller/metal3.io/host_state_machine.go

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -314,11 +314,17 @@ func (hsm *hostStateMachine) checkDetachedHost(ctx context.Context, info *reconc
314314
// provisioner and take no further action
315315
// Note this doesn't change the current state, only the OperationalStatus
316316
if hasDetachedAnnotation(hsm.Host) {
317-
// Only allow detaching hosts in Provisioned/ExternallyProvisioned/Ready/Available states
317+
// Only allow detaching hosts in Provisioned/ExternallyProvisioned/Ready/Available states unless forced
318318
switch info.host.Status.Provisioning.State {
319319
case metal3api.StateProvisioned, metal3api.StateExternallyProvisioned, metal3api.StateReady, metal3api.StateAvailable:
320-
return hsm.Reconciler.detachHost(ctx, hsm.Provisioner, info)
320+
return hsm.Reconciler.detachHost(ctx, hsm.Provisioner, info, false)
321+
case metal3api.StateDeleting:
322+
// No point in detaching a host that is being deleted already
321323
default:
324+
if hasForceDetachAnnotation(hsm.Host) {
325+
info.log.Info("forcing detach of host", "provisioningState", info.host.Status.Provisioning.State)
326+
return hsm.Reconciler.detachHost(ctx, hsm.Provisioner, info, true)
327+
}
322328
info.log.Info("host cannot be detached yet, waiting for the current operation to finish", "provisioningState", info.host.Status.Provisioning.State)
323329
}
324330
}
@@ -340,6 +346,20 @@ func (hsm *hostStateMachine) checkDetachedHost(ctx context.Context, info *reconc
340346
return nil
341347
}
342348

349+
func hasForceDetachAnnotation(host *metal3api.BareMetalHost) bool {
350+
annotations := host.GetAnnotations()
351+
if annotations != nil {
352+
if val, ok := annotations[metal3api.DetachedAnnotation]; ok {
353+
args := metal3api.DetachedAnnotationArguments{}
354+
if err := json.Unmarshal([]byte(val), &args); err != nil {
355+
return false
356+
}
357+
return args.Force
358+
}
359+
}
360+
return false
361+
}
362+
343363
func (hsm *hostStateMachine) ensureRegistered(ctx context.Context, info *reconcileInfo) (result actionResult) {
344364
if !hsm.haveCreds {
345365
// If we are in the process of deletion (which may start with

internal/controller/metal3.io/host_state_machine_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1379,11 +1379,11 @@ func (m *mockProvisioner) Deprovision(_ context.Context, _ bool, _ metal3api.Aut
13791379
return m.getNextResultByMethod("Deprovision"), err
13801380
}
13811381

1382-
func (m *mockProvisioner) Delete(context.Context) (result provisioner.Result, err error) {
1382+
func (m *mockProvisioner) Delete(_ context.Context) (result provisioner.Result, err error) {
13831383
return m.getNextResultByMethod("Delete"), err
13841384
}
13851385

1386-
func (m *mockProvisioner) Detach(context.Context) (result provisioner.Result, err error) {
1386+
func (m *mockProvisioner) Detach(_ context.Context, _ bool) (result provisioner.Result, err error) {
13871387
res := m.getNextResultByMethod("Detach")
13881388
return res, err
13891389
}

pkg/provisioner/demo/demo.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -287,7 +287,7 @@ func (p *demoProvisioner) Delete(_ context.Context) (result provisioner.Result,
287287
// for the target system. It may be called multiple times,
288288
// and should return true for its dirty flag until the
289289
// deletion operation is completed.
290-
func (p *demoProvisioner) Detach(_ context.Context) (result provisioner.Result, err error) {
290+
func (p *demoProvisioner) Detach(_ context.Context, _ bool) (result provisioner.Result, err error) {
291291
p.log.Info("detaching host")
292292
return result, nil
293293
}

pkg/provisioner/fixture/fixture.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -349,7 +349,7 @@ func (p *fixtureProvisioner) Delete(_ context.Context) (result provisioner.Resul
349349
// for the target system. It may be called multiple times,
350350
// and should return true for its dirty flag until the
351351
// deletion operation is completed.
352-
func (p *fixtureProvisioner) Detach(ctx context.Context) (result provisioner.Result, err error) {
352+
func (p *fixtureProvisioner) Detach(ctx context.Context, _ bool) (result provisioner.Result, err error) {
353353
return p.Delete(ctx)
354354
}
355355

pkg/provisioner/ironic/clients/features.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,15 @@ func (af AvailableFeatures) HasDisablePowerOff() bool {
5656
return af.MaxVersion >= 95 //nolint:mnd
5757
}
5858

59+
func (af AvailableFeatures) HasDeploymentAbort() bool {
60+
return af.MaxVersion >= 110 //nolint:mnd
61+
}
62+
5963
func (af AvailableFeatures) ChooseMicroversion() string {
64+
if af.HasDeploymentAbort() {
65+
return "1.110"
66+
}
67+
6068
if af.HasDisablePowerOff() {
6169
return "1.95"
6270
}

pkg/provisioner/ironic/delete_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ func deleteTest(t *testing.T, detach bool) {
217217

218218
var result provisioner.Result
219219
if detach {
220-
result, err = prov.Detach(t.Context())
220+
result, err = prov.Detach(t.Context(), false)
221221
} else {
222222
result, err = prov.Delete(t.Context())
223223
}

pkg/provisioner/ironic/ironic.go

Lines changed: 77 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1527,23 +1527,27 @@ func (p *ironicProvisioner) Delete(ctx context.Context) (result provisioner.Resu
15271527
"target", ironicNode.TargetProvisionState,
15281528
"deploy step", ironicNode.DeployStep,
15291529
)
1530+
return p.realDelete(ctx, ironicNode, false)
1531+
}
15301532

1533+
func (p *ironicProvisioner) realDelete(ctx context.Context, ironicNode *nodes.Node, force bool) (result provisioner.Result, err error) {
15311534
currentProvState := nodes.ProvisionState(ironicNode.ProvisionState)
15321535

1533-
// Handle verifying state specially: Ironic holds an exclusive lock during
1534-
// verification, so we can't set maintenance mode or delete until it completes.
1535-
// Just wait for the verification to finish (success or timeout).
1536-
if currentProvState == nodes.Verifying {
1536+
switch currentProvState {
1537+
case nodes.Verifying:
1538+
// Handle verifying state specially: Ironic holds an exclusive lock during
1539+
// verification, so we can't set maintenance mode or delete until it completes.
1540+
// Just wait for the verification to finish (success or timeout).
15371541
p.log.Info("node is verifying, waiting for verification to complete before deletion")
15381542
return operationContinuing(provisionRequeueDelay)
1539-
}
15401543

1541-
// For enroll state, the node can be deleted directly without maintenance mode
1542-
// since it has no Nova associations and isn't locked.
1543-
if currentProvState == nodes.Enroll {
1544+
case nodes.Enroll:
1545+
// For enroll state, the node can be deleted directly without maintenance mode
1546+
// since it has no Nova associations and isn't locked.
15441547
p.log.Info("node is in enroll state, proceeding to delete directly")
15451548
// Fall through to deletion
1546-
} else if currentProvState == nodes.Available || currentProvState == nodes.Manageable {
1549+
1550+
case nodes.Available, nodes.Manageable:
15471551
// Make sure we don't have a stale instance UUID
15481552
if ironicNode.InstanceUUID != "" {
15491553
var success bool
@@ -1555,20 +1559,51 @@ func (p *ironicProvisioner) Delete(ctx context.Context) (result provisioner.Resu
15551559
return result, err
15561560
}
15571561
}
1558-
} else if !ironicNode.Maintenance {
1559-
// If we see an active node and the controller doesn't think
1560-
// we need to deprovision it, that means the node was
1561-
// ExternallyProvisioned and we should remove it from Ironic
1562-
// without deprovisioning it.
1563-
//
1564-
// If we see a node with an error, we will have to set the
1565-
// maintenance flag before deleting it.
1566-
//
1567-
// Any other state requires us to use maintenance mode to
1568-
// delete while bypassing Ironic's internal checks related to
1569-
// Nova.
1570-
p.log.Info("setting host maintenance flag to force image delete")
1571-
return p.setMaintenanceFlag(ctx, ironicNode, true, "forcing deletion in baremetal-operator")
1562+
1563+
case nodes.Deploying, nodes.Cleaning, nodes.Inspecting, nodes.Servicing, nodes.Deleting:
1564+
p.log.Info("node is in state that does not allow deletion, waiting", "currentState", currentProvState)
1565+
return operationContinuing(provisionRequeueDelay)
1566+
1567+
case nodes.DeployWait:
1568+
if force && !p.availableFeatures.HasDeploymentAbort() {
1569+
p.log.Info("deprovisioning to force deletion")
1570+
// No new API - fall back to deprovisioning and wait for CLEANWAIT
1571+
return p.changeNodeProvisionState(ctx, ironicNode,
1572+
nodes.ProvisionStateOpts{Target: nodes.TargetDeleted},
1573+
)
1574+
}
1575+
1576+
// Otherwise, use the abort API as well
1577+
fallthrough
1578+
1579+
case nodes.CleanWait, nodes.ServiceWait:
1580+
if force {
1581+
p.log.Info("aborting the current operation to force deletion", "currentState", currentProvState)
1582+
return p.changeNodeProvisionState(ctx, ironicNode,
1583+
nodes.ProvisionStateOpts{Target: nodes.TargetAbort},
1584+
)
1585+
}
1586+
1587+
// Normal deletion won't work in these states, so wait
1588+
p.log.Info("node is in state that does not allow deletion, waiting", "currentState", currentProvState)
1589+
return operationContinuing(provisionRequeueDelay)
1590+
1591+
default:
1592+
if !ironicNode.Maintenance {
1593+
// If we see an active node and the controller doesn't think
1594+
// we need to deprovision it, that means the node was
1595+
// ExternallyProvisioned and we should remove it from Ironic
1596+
// without deprovisioning it.
1597+
//
1598+
// If we see a node with an error, we will have to set the
1599+
// maintenance flag before deleting it.
1600+
//
1601+
// Any other state requires us to use maintenance mode to
1602+
// delete while bypassing Ironic's internal checks related to
1603+
// Nova.
1604+
p.log.Info("setting host maintenance flag to force image delete", "currentState", currentProvState)
1605+
return p.setMaintenanceFlag(ctx, ironicNode, true, "forcing deletion in baremetal-operator")
1606+
}
15721607
}
15731608

15741609
p.log.Info("host ready to be removed")
@@ -1592,10 +1627,26 @@ func (p *ironicProvisioner) Delete(ctx context.Context) (result provisioner.Resu
15921627
// for the target system. It may be called multiple times,
15931628
// and should return true for its dirty flag until the
15941629
// deletion operation is completed.
1595-
func (p *ironicProvisioner) Detach(ctx context.Context) (result provisioner.Result, err error) {
1630+
func (p *ironicProvisioner) Detach(ctx context.Context, force bool) (result provisioner.Result, err error) {
15961631
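// Same as Delete(), except that force is passed on to realDelete, which then aborts or deprovisions a node stuck in a wait state instead of waiting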
1597-
p.log.Info("removing the node for detachment", "node", p.nodeID)
1598-
return p.Delete(ctx)
1632+
ironicNode, err := p.getNode(ctx)
1633+
if err != nil {
1634+
if errors.Is(err, provisioner.ErrNeedsRegistration) {
1635+
p.log.Info("no node found, already deleted")
1636+
return operationComplete()
1637+
}
1638+
return transientError(err)
1639+
}
1640+
1641+
p.log.Info("deleting host for detachment",
1642+
"ID", ironicNode.UUID,
1643+
"lastError", ironicNode.LastError,
1644+
"current", ironicNode.ProvisionState,
1645+
"target", ironicNode.TargetProvisionState,
1646+
"deploy step", ironicNode.DeployStep,
1647+
"force", force,
1648+
)
1649+
return p.realDelete(ctx, ironicNode, force)
15991650
}
16001651

16011652
// softPowerOffUnsupportedError is returned when the BMC does not

pkg/provisioner/provisioner.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ type Provisioner interface {
189189
// for the target system. It may be called multiple times,
190190
// and should return true for its dirty flag until the
191191
// deletion operation is completed.
192-
Detach(ctx context.Context) (result Result, err error)
192+
Detach(ctx context.Context, force bool) (result Result, err error)
193193

194194
// PowerOn ensures the server is powered on independently of any image
195195
// provisioning operation.

0 commit comments

Comments
 (0)