diff --git a/internal/controller/talosupgrade_controller.go b/internal/controller/talosupgrade_controller.go index 7d0a2b7..6f5f754 100644 --- a/internal/controller/talosupgrade_controller.go +++ b/internal/controller/talosupgrade_controller.go @@ -582,32 +582,36 @@ func (r *TalosUpgradeReconciler) findNextNode(ctx context.Context, talosUpgrade targetVersion := talosUpgrade.Spec.Talos.Version - // Use slices.IndexFunc to find first node needing upgrade - idx := slices.IndexFunc(nodes, func(node corev1.Node) bool { + // Iterate through nodes to find first one needing upgrade + for _, node := range nodes { // Skip completed nodes if slices.Contains(talosUpgrade.Status.CompletedNodes, node.Name) { - return false + continue } // Skip failed nodes if slices.ContainsFunc(talosUpgrade.Status.FailedNodes, func(fn tupprv1alpha1.NodeUpgradeStatus) bool { return fn.NodeName == node.Name }) { - return false + continue } // Check if needs upgrade - return r.nodeNeedsUpgrade(ctx, &node, targetVersion) - }) + needsUpgrade, err := r.nodeNeedsUpgrade(ctx, &node, targetVersion) + if err != nil { + // If we cannot determine upgrade need (e.g., certificate errors), return error to retry + logger.Error(err, "Cannot determine if node needs upgrade, will retry", "node", node.Name) + return "", fmt.Errorf("failed to check upgrade status for node %s: %w", node.Name, err) + } - if idx == -1 { - logger.V(1).Info("No nodes need upgrade") - return "", nil + if needsUpgrade { + logger.Info("Node needs upgrade", "node", node.Name) + return node.Name, nil + } } - nodeName := nodes[idx].Name - logger.Info("Node needs upgrade", "node", nodeName) - return nodeName, nil + logger.V(1).Info("No nodes need upgrade") + return "", nil } // getSortedNodes retrieves all nodes in the cluster sorted by name for consistent ordering @@ -626,20 +630,21 @@ func (r *TalosUpgradeReconciler) getSortedNodes(ctx context.Context) ([]corev1.N } // nodeNeedsUpgrade determines if a given node requires an upgrade using Talos client -func (r *TalosUpgradeReconciler) nodeNeedsUpgrade(ctx context.Context, node *corev1.Node, targetVersion string) bool { +// Returns (needsUpgrade, error) - error is non-nil if upgrade need cannot be determined +func (r *TalosUpgradeReconciler) nodeNeedsUpgrade(ctx context.Context, node *corev1.Node, targetVersion string) (bool, error) { logger := log.FromContext(ctx) nodeIP, err := GetNodeIP(node) if err != nil { logger.Error(err, "Failed to get node IP, cannot determine upgrade need", "node", node.Name) - return false // Cannot proceed without IP + return false, fmt.Errorf("failed to get node IP for %s: %w", node.Name, err) } currentVersion, err := r.TalosClient.GetNodeVersion(ctx, nodeIP) if err != nil { logger.Error(err, "Failed to get current version from Talos client, cannot determine upgrade need", "node", node.Name, "nodeIP", nodeIP) - return false // Cannot proceed without Talos info + return false, fmt.Errorf("failed to get node version for %s: %w", node.Name, err) } needsUpgrade := currentVersion != targetVersion @@ -649,7 +654,7 @@ func (r *TalosUpgradeReconciler) nodeNeedsUpgrade(ctx context.Context, node *cor "targetVersion", targetVersion, "needsUpgrade", needsUpgrade) - return needsUpgrade + return needsUpgrade, nil } // createJob creates a Kubernetes Job to perform the Talos upgrade on the specified node