Commit e137c7c

fix(task.go): proxmox qmstart is inconsistent and leads to state failure
The TaskFailed reason is now set only when the failing action is not qmstart, because a genuine qmstart failure looks impossible to detect. This fixes the state machine transition in ReconcileVM.
1 parent 5646539 commit e137c7c
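Editor's note: the sketch below distills the behaviour described in the commit message, using a simplified stand-in task type and placeholder reason strings instead of the real go-proxmox and infrav1 types (the actual change is in the task.go diff further down); it is an illustration, not repository code.

	package main

	import "fmt"

	// task is a simplified stand-in for the go-proxmox Task type; only the
	// fields this sketch needs are modelled.
	type task struct {
		Type     string // Proxmox task type, e.g. "qmclone" or "qmstart"
		IsFailed bool
	}

	// failureReason mirrors the new behaviour: a failed qmstart keeps the
	// current condition reason, any other failed task is reported as
	// TaskFailed. The reason strings here are placeholders, not the real
	// infrav1 constants.
	func failureReason(t task, currentReason string) string {
		if t.IsFailed && t.Type != "qmstart" {
			return "TaskFailed"
		}
		return currentReason
	}

	func main() {
		current := "PoweringOn" // hypothetical current reason
		fmt.Println(failureReason(task{Type: "qmstart", IsFailed: true}, current)) // PoweringOn
		fmt.Println(failureReason(task{Type: "qmclone", IsFailed: true}, current)) // TaskFailed
	}

In the real change the preserved reason is whatever conditions.GetReason currently reports for the VirtualMachineProvisioned condition, so the ReconcileVM guard (see the vm.go diff below) only trips for non-qmstart task failures.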

4 files changed

Lines changed: 34 additions & 18 deletions

internal/service/taskservice/task.go

Lines changed: 21 additions & 11 deletions
@@ -103,26 +103,36 @@ func checkAndRetryTask(scope *scope.MachineScope, task *proxmox.Task) (bool, err
 	case task.IsRunning:
 		logger.Info("task is still pending", "description", task.Type)
 		return true, nil
-	case task.IsSuccessful:
+	case task.IsSuccessful && task.IsCompleted:
 		logger.Info("task is a success", "description", task.Type)
 		scope.ProxmoxMachine.Status.TaskRef = nil
 		return false, nil
 	case task.IsFailed:
-		logger.Info("task failed", "description", task.Type)
-
-		// NOTE: When a task fails there is not simple way to understand which operation is failing (e.g. cloning or powering on)
-		// so we are reporting failures using a dedicated reason until we find a better solution.
-		var errorMessage string
+		// Failing tasks are actually red herrings. Some tasks fail, other
+		// tasks can fail successfully (like qmstart).
+		// We save the condition so the ReconcileVM statemachine keeps on working.
+		conditionReason := conditions.GetReason(scope.ProxmoxMachine, infrav1.ProxmoxMachineVirtualMachineProvisionedCondition)
+
+		// qmstart can fail and yet actually start the VM. We can not handle qmstart properly.
+		// In fact qmstart can find a machine already started, because proxmox's api is
+		// eventually consistent here.
+		// For all other jobs we do set the condition to failed.
+		if task.Type != "qmstart" {
+			logger.Info("task failed", "description", task.Type)
+			// We notify the user that intervention is required. This should stop the state machine.
+			conditionReason = infrav1.ProxmoxMachineVirtualMachineProvisionedTaskFailedReason
+		}
 
-		if task.ExitStatus != "OK" {
-			errorMessage = task.ExitStatus
-		} else {
-			errorMessage = "task failed but its exit status is OK; this should not happen"
+		errorMessage := fmt.Sprintf("%s: %s", task.Type, task.ExitStatus)
+		if task.ExitStatus == "OK" {
+			// If you end up here, file a bug with go-proxmox.
+			errorMessage = fmt.Sprintf("task %s failed but its exit status is OK; this should not happen", task.UPID)
 		}
+
 		conditions.Set(scope.ProxmoxMachine, metav1.Condition{
 			Type:    infrav1.ProxmoxMachineVirtualMachineProvisionedCondition,
 			Status:  metav1.ConditionFalse,
-			Reason:  infrav1.ProxmoxMachineVirtualMachineProvisionedTaskFailedReason,
+			Reason:  conditionReason,
 			Message: errorMessage,
 		})
 
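A side note on the stricter success case above: a task must now be both successful and completed before the TaskRef is cleared, which is why the newTask helper in helpers_test.go below gains both flags. A minimal, self-contained sketch of the resulting classification, again with a simplified stand-in type rather than the real go-proxmox Task:

	package main

	import "fmt"

	// task is a simplified stand-in for the fields the switch above consults.
	type task struct {
		IsRunning    bool
		IsSuccessful bool
		IsCompleted  bool
		IsFailed     bool
	}

	// classify mirrors the switch in checkAndRetryTask: success requires both
	// flags, and a task matching no case falls through to whatever follows
	// the switch.
	func classify(t task) string {
		switch {
		case t.IsRunning:
			return "still pending: requeue"
		case t.IsSuccessful && t.IsCompleted:
			return "success: clear TaskRef"
		case t.IsFailed:
			return "failed: maybe set TaskFailed"
		default:
			return "no match: falls through to the code after the switch"
		}
	}

	func main() {
		fmt.Println(classify(task{IsSuccessful: true}))                    // no match: falls through
		fmt.Println(classify(task{IsSuccessful: true, IsCompleted: true})) // success: clear TaskRef
	}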
internal/service/vmservice/helpers_test.go

Lines changed: 5 additions & 1 deletion
@@ -544,7 +544,11 @@ func createBootstrapSecret(t *testing.T, c client.Client, machineScope *scope.Ma
 }
 
 func newTask() *proxmox.Task {
-	return &proxmox.Task{UPID: "result"}
+	return &proxmox.Task{
+		UPID:         "result",
+		IsSuccessful: true,
+		IsCompleted:  true,
+	}
 }
 
 func newVMResource() *proxmox.ClusterResource {

internal/service/vmservice/vm.go

Lines changed: 2 additions & 1 deletion
@@ -115,7 +115,8 @@ func ReconcileVM(ctx context.Context, scope *scope.MachineScope) (infrav1.Virtua
 	} // VirtualMachineProvisioned reason is WaitingForBootstrapReady
 
 	// handle invalid state of the machine
-	if conditions.GetReason(scope.ProxmoxMachine, infrav1.ProxmoxMachineVirtualMachineProvisionedCondition) == infrav1.ProxmoxMachineVirtualMachineProvisionedVMProvisionFailedReason {
+	if conditions.GetReason(scope.ProxmoxMachine, infrav1.ProxmoxMachineVirtualMachineProvisionedCondition) == infrav1.ProxmoxMachineVirtualMachineProvisionedVMProvisionFailedReason ||
+		conditions.GetReason(scope.ProxmoxMachine, infrav1.ProxmoxMachineVirtualMachineProvisionedCondition) == infrav1.ProxmoxMachineVirtualMachineProvisionedTaskFailedReason {
 		scope.Logger.V(4).Info("invalid proxmoxmachine state", "state", conditions.GetReason(scope.ProxmoxMachine, infrav1.ProxmoxMachineVirtualMachineProvisionedCondition))
 		// If you end up here, please file a bug report.
 		return vm, errors.New("invalid state (failed and no error)")
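The guard above now treats two condition reasons as terminal. A small self-contained sketch of that decision, with placeholder reason strings standing in for the infrav1 constants referenced in the diff:

	package main

	import (
		"errors"
		"fmt"
	)

	// Placeholder reason values; the repository uses infrav1 constants instead.
	const (
		reasonVMProvisionFailed   = "VMProvisionFailed"
		reasonTaskFailed          = "TaskFailed"
		reasonWaitingForBootstrap = "WaitingForBootstrapData"
	)

	// isTerminalReason mirrors the ReconcileVM guard: provisioning failures and
	// (non-qmstart) task failures both stop the state machine.
	func isTerminalReason(reason string) bool {
		return reason == reasonVMProvisionFailed || reason == reasonTaskFailed
	}

	func main() {
		for _, r := range []string{reasonWaitingForBootstrap, reasonTaskFailed} {
			if isTerminalReason(r) {
				fmt.Println(r, "->", errors.New("invalid state (failed and no error)"))
				continue
			}
			fmt.Println(r, "-> keep reconciling")
		}
	}

Reading the reason once into a helper, as in the sketch, is just one way to phrase the check; the diff keeps the explicit two-clause comparison.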

internal/service/vmservice/vm_test.go

Lines changed: 6 additions & 5 deletions
@@ -726,6 +726,8 @@ func TestReconcileVM_EndToEnd(t *testing.T) {
 		},
 	}
 
+	task := newTask()
+
 	// Round 0: no VM exists yet; CloneVM creates one and requeues for the task to complete.
 	proxmoxClient.EXPECT().GetReservableMemoryBytes(context.Background(), "node1", int64(100)).Return(0, nil).Once()
 	proxmoxClient.EXPECT().GetReservableMemoryBytes(context.Background(), "node2", int64(100)).Return(^uint64(0), nil).Once()
@@ -739,7 +741,7 @@
 		SnapName: "snap",
 		Storage:  "storage",
 		Target:   "node2",
-	}).Return(proxmox.VMCloneResponse{NewID: 123, Task: newTask()}, nil).Once()
+	}).Return(proxmox.VMCloneResponse{NewID: 123, Task: task}, nil).Once()
 
 	result, err := ReconcileVM(context.Background(), machineScope)
 	require.NoError(t, err)
@@ -754,10 +756,9 @@
 	)
 
 	// Round 1: clone task complete; ConfigureVM sets VM options and requeues.
-	proxmoxClient.EXPECT().GetTask(context.Background(), "result").Return(&lutherproxmox.Task{UPID: "result", IsSuccessful: true}, nil).Once()
+	proxmoxClient.EXPECT().GetTask(context.Background(), "result").Return(task, nil).Once()
 	proxmoxClient.EXPECT().GetVM(context.Background(), "node2", int64(123)).Return(vm, nil).Once()
 
-	task := newTask()
 	expectedVMConfigureRequest := []interface{}{
 		proxmox.VirtualMachineOption{Name: optionSockets, Value: *machineScope.ProxmoxMachine.Spec.NumSockets},
 		proxmox.VirtualMachineOption{Name: optionCores, Value: *machineScope.ProxmoxMachine.Spec.NumCores},
@@ -782,7 +783,7 @@
 	// Round 2: ConfigureVM task has completed; reconcileDisks resizes the boot volume,
 	// then reconcileIPAddresses advances the state to WaitingForBootstrapData.
 	// We're not mocking the entirety of a network setup.
-	proxmoxClient.EXPECT().GetTask(context.Background(), "result").Return(&lutherproxmox.Task{UPID: "result", IsSuccessful: true}, nil).Once()
+	proxmoxClient.EXPECT().GetTask(context.Background(), "result").Return(task, nil).Once()
 	proxmoxClient.EXPECT().GetVM(context.Background(), "node2", int64(123)).Return(vm, nil).Once()
 	proxmoxClient.EXPECT().ResizeDisk(context.Background(), vm, "scsi0", "50G").Return(nil, nil).Once()
 
@@ -842,7 +843,7 @@
 	vm.Status = lutherproxmox.StatusVirtualMachineRunning
 	vm.QMPStatus = lutherproxmox.StatusVirtualMachineRunning
 
-	proxmoxClient.EXPECT().GetTask(context.Background(), "result").Return(&lutherproxmox.Task{UPID: "result", IsSuccessful: true}, nil).Once()
+	proxmoxClient.EXPECT().GetTask(context.Background(), "result").Return(task, nil).Once()
 	proxmoxClient.EXPECT().GetVM(context.Background(), "node2", int64(123)).Return(vm, nil).Once()
 	proxmoxClient.EXPECT().QemuAgentStatus(context.Background(), vm).Return(nil).Once()
 	proxmoxClient.EXPECT().CloudInitStatus(context.Background(), vm).Return(false, nil).Once()
