Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
# Image URL to use all building/pushing image targets
CONTROLLER_IMG ?= controller:latest
METALPROBE_IMG ?= metalprobe:latest
BMCTOOLS_IMG ?= bmctools:latest

# Docker image name for the mkdocs based local development setup
IMAGE=ironcore-dev/metal-operator-docs
Expand Down Expand Up @@ -183,15 +182,10 @@ docker-build-controller-manager: ## Build controller-manager.
docker-build-metalprobe: ## Build metalprobe.
docker build --target probe -t ${METALPROBE_IMG} .

.PHONY: docker-build-bmctools
docker-build-bmctools: ## Build bmctools.
docker build --target bmctools -t ${BMCTOOLS_IMG} .

.PHONY: docker-push
docker-push: ## Push docker image with the manager.
$(CONTAINER_TOOL) push ${CONTROLLER_IMG}
$(CONTAINER_TOOL) push ${METALPROBE_IMG}
$(CONTAINER_TOOL) push ${BMCTOOLS_IMG}

# PLATFORMS defines the target platforms for the manager image be built to provide support to multiple
# architectures. (i.e. make docker-buildx IMG=myregistry/mypoperator:0.0.1). To use this option you need to:
Expand Down
34 changes: 0 additions & 34 deletions cmd/bmctools/cmds/cmds.go

This file was deleted.

64 changes: 0 additions & 64 deletions cmd/bmctools/cmds/reset.go

This file was deleted.

19 changes: 0 additions & 19 deletions cmd/bmctools/main.go

This file was deleted.

20 changes: 12 additions & 8 deletions internal/bmcutils/bmcutils.go
Original file line number Diff line number Diff line change
Expand Up @@ -221,14 +221,18 @@ func GetServerNameFromBMCandIndex(index int, bmcObj *metalv1alpha1.BMC) string {
}

func SSHResetBMC(ctx context.Context, ip, manufacturer, username, password string, timeout time.Duration) error {
// If Redfish reset fails, try SSH-based reset for known manufacturers
// SSH-based reset for BMC recovery scenarios
// Note: Using InsecureIgnoreHostKey() is a pragmatic choice for BMC reset operations:
// - BMC resets typically occur when the BMC is already experiencing issues
// - This is infrastructure management in a controlled environment
// - Host key management adds operational complexity for automated recovery
// - The risk is mitigated by: credential-based auth, internal network, and limited scope
// TODO: Consider adding opt-in secure host key verification via stored keys for enhanced security
config := &ssh.ClientConfig{
User: username,
Auth: []ssh.AuthMethod{ssh.Password(password)},
HostKeyCallback: func(hostname string, remote net.Addr, key ssh.PublicKey) error {
return nil
},
Timeout: timeout,
User: username,
Auth: []ssh.AuthMethod{ssh.Password(password)},
HostKeyCallback: ssh.InsecureIgnoreHostKey(), // #nosec G106 - See security note above
Timeout: timeout,
Comment thread
coderabbitai[bot] marked this conversation as resolved.
}
resetCMD := ""
switch manufacturer {
Expand Down Expand Up @@ -256,7 +260,7 @@ func SSHResetBMC(ctx context.Context, ip, manufacturer, username, password strin
defer func() {
_ = session.Close()
}()
// cancel reset cmd after 5 minutes
// cancel reset cmd after timeout
ctx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
done := make(chan error, 1)
Expand Down
97 changes: 84 additions & 13 deletions internal/controller/bmc_controller.go
Comment thread
stefanhipfel marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ func (r *BMCReconciler) reconcile(ctx context.Context, bmcObj *metalv1alpha1.BMC
}
if r.waitForBMCReset(bmcObj, r.BMCResetWaitTime) {
log.V(1).Info("Skipped BMC reconciliation while waiting for BMC reset to complete")
err := r.updateBMCState(ctx, bmcObj, metalv1alpha1.BMCStatePending)
err := r.updateBMCStateToPending(ctx, bmcObj)
if err != nil {
return ctrl.Result{}, err
}
Expand Down Expand Up @@ -468,12 +468,12 @@ func (r *BMCReconciler) shouldResetBMC(bmcObj *metalv1alpha1.BMC) bool {
return false
}

func (r *BMCReconciler) updateBMCState(ctx context.Context, bmcObj *metalv1alpha1.BMC, state metalv1alpha1.BMCState) error {
if bmcObj.Status.State == state {
func (r *BMCReconciler) updateBMCStateToPending(ctx context.Context, bmcObj *metalv1alpha1.BMC) error {
if bmcObj.Status.State == metalv1alpha1.BMCStatePending {
return nil
}
bmcBase := bmcObj.DeepCopy()
bmcObj.Status.State = state
bmcObj.Status.State = metalv1alpha1.BMCStatePending
if err := r.Status().Patch(ctx, bmcObj, client.MergeFrom(bmcBase)); err != nil {
return fmt.Errorf("failed to patch BMC state to Pending: %w", err)
}
Expand All @@ -489,19 +489,90 @@ func (r *BMCReconciler) resetBMC(ctx context.Context, bmcObj *metalv1alpha1.BMC,
if bmcClient != nil {
if err = bmcClient.ResetManager(ctx, bmcObj.Spec.BMCUUID, schemas.GracefulRestartResetType); err == nil {
log.Info("Successfully reset BMC via Redfish", "BMC", bmcObj.Name)
return r.updateBMCState(ctx, bmcObj, metalv1alpha1.BMCStatePending)
return r.updateBMCStateToPending(ctx, bmcObj)
}
}
// BMC Unavailable, currently can not perform reset. try to reset with ssh when available
log.Error(err, "failed to reset BMC via Redfish, falling back to rest via ssh", "BMC", bmcObj.Name)
if httpErr, ok := err.(*schemas.Error); ok {
// only handle 5xx errors
if httpErr.HTTPReturnedStatusCode < 500 || httpErr.HTTPReturnedStatusCode >= 600 {
return errors.Join(r.updateBMCState(ctx, bmcObj, metalv1alpha1.BMCStatePending), fmt.Errorf("cannot reset bmc: %w", err))
// BMC Unavailable, currently can not perform reset via Redfish
log.Error(err, "Could not reset BMC via Redfish", "BMC", bmcObj.Name)

// Try SSH reset for 5xx server errors (BMC is having issues)
// For 4xx client errors (auth/permission), SSH won't help
if httpErr, ok := err.(*schemas.Error); ok {
if httpErr.HTTPReturnedStatusCode >= 500 && httpErr.HTTPReturnedStatusCode < 600 {
// 5xx server error - attempt SSH-based reset via goroutine with proper context
// The bmcResetConditionType condition (set to True above) acts as idempotency guard:
// - waitForBMCReset() will prevent new resets during the delay period
// - After delay, we retry BMC connection
// - Condition is cleared when connection succeeds (via handlePreviousBMCResetAnnotations)
resetCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
go func() {
defer cancel()
r.resetBMCViaSSH(resetCtx, bmcObj)
}()
log.Info("Initiated SSH-based BMC reset in background due to Redfish 5xx error", "BMC", bmcObj.Name)
return r.updateBMCStateToPending(ctx, bmcObj)
}
// 4xx or other status code - don't try SSH
return errors.Join(r.updateBMCStateToPending(ctx, bmcObj), fmt.Errorf("cannot reset bmc: %w", err))
}
} else {
return fmt.Errorf("cannot reset bmc, unknown error: %w", err)
}
// bmcClient is nil - cannot perform any reset
return errors.Join(r.updateBMCStateToPending(ctx, bmcObj), fmt.Errorf("cannot reset bmc: client unavailable"))
}

func (r *BMCReconciler) resetBMCViaSSH(ctx context.Context, bmcObj *metalv1alpha1.BMC) {
log := ctrl.LoggerFrom(ctx).WithValues("BMC", bmcObj.Name)
log.V(1).Info("Starting SSH-based BMC reset")

currentBMC := &metalv1alpha1.BMC{}
if err := r.Get(ctx, client.ObjectKey{Name: bmcObj.Name}, currentBMC); err != nil {
log.Error(err, "Failed to fetch BMC object for SSH reset")
return
}
address, err := bmcutils.GetBMCAddressForBMC(ctx, r.Client, currentBMC)
if err != nil {
log.Error(err, "Failed to get BMC address for SSH reset")
_ = r.updateConditions(ctx, currentBMC, false, bmcResetConditionType, corev1.ConditionFalse, bmcConnectionFailedReason, fmt.Sprintf("Failed to get BMC address: %v", err))
return
}
if currentBMC.Status.Manufacturer == "" {
log.Error(nil, "BMC manufacturer not available for SSH reset")
_ = r.updateConditions(ctx, currentBMC, false, bmcResetConditionType, corev1.ConditionFalse, bmcInternalErrorReason, "BMC manufacturer not available")
return
}
username, password, err := bmcutils.GetBMCCredentialsForBMCSecretName(ctx, r.Client, currentBMC.Spec.BMCSecretRef.Name)
if err != nil {
log.Error(err, "Failed to get BMC credentials for SSH reset")
_ = r.updateConditions(ctx, currentBMC, false, bmcResetConditionType, corev1.ConditionFalse, bmcAuthenticationFailedReason, fmt.Sprintf("Failed to get credentials: %v", err))
return
}
sshTimeout := 2 * time.Minute
Comment thread
stefanhipfel marked this conversation as resolved.
Outdated
if err := bmcutils.SSHResetBMC(ctx, address, currentBMC.Status.Manufacturer, username, password, sshTimeout); err != nil {
log.Error(err, "Failed to reset BMC via SSH")
_ = r.updateConditions(ctx, currentBMC, false, bmcResetConditionType, corev1.ConditionFalse, bmcInternalErrorReason, fmt.Sprintf("SSH reset failed: %v", err))
return
}
log.Info("Successfully reset BMC via SSH")

// Note: We don't clear the condition here. The condition will be cleared automatically
// when BMC connection succeeds in the next reconciliation via handlePreviousBMCResetAnnotations()

if err := r.updateLastResetTime(ctx, currentBMC); err != nil {
log.Error(err, "Failed to update LastResetTime after SSH reset")
}
}

func (r *BMCReconciler) updateLastResetTime(ctx context.Context, bmcObj *metalv1alpha1.BMC) error {
currentBMC := &metalv1alpha1.BMC{}
if err := r.Get(ctx, client.ObjectKey{Name: bmcObj.Name}, currentBMC); err != nil {
return fmt.Errorf("failed to fetch BMC: %w", err)
}
bmcBase := currentBMC.DeepCopy()
now := metav1.Now()
currentBMC.Status.LastResetTime = &now
if err := r.Status().Patch(ctx, currentBMC, client.MergeFrom(bmcBase)); err != nil {
return fmt.Errorf("failed to patch LastResetTime: %w", err)
}
return nil
}
Comment thread
stefanhipfel marked this conversation as resolved.
Outdated

Expand Down
Loading