Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions api/v1/nmstate_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,11 @@ type NMStateSpec struct {
// +kubebuilder:validation:Enum=info;debug
// +optional
LogLevel shared.LogLevel `json:"logLevel,omitempty"`
// RetryConfiguration is an optional configuration for NNCP retry and backoff strategy.
// If RetryConfiguration is specified, the handler will use the config defined here instead of its default values.
// +kubebuilder:default:={}
// +optional
RetryConfiguration NMStateRetryConfiguration `json:"retryConfiguration,omitempty"`
}

type SelfSignConfiguration struct {
Expand Down Expand Up @@ -102,6 +107,26 @@ type NMStateMetricsConfiguration struct {
BindAddress string `json:"bindAddress,omitempty"`
}

type NMStateRetryConfiguration struct {
// InitialBackoff is the initial retry delay when applying network configuration fails.
// The backoff will increase exponentially up to MaximumBackoff.
// +kubebuilder:default:="1s"
// +optional
InitialBackoff metav1.Duration `json:"initialBackoff,omitempty"`
// MaximumBackoff is the maximum retry delay when applying network configuration fails.
// The exponential backoff will not exceed this value.
// +kubebuilder:default:="30s"
// +optional
MaximumBackoff metav1.Duration `json:"maximumBackoff,omitempty"`
// MaxRetries is the maximum number of retry attempts before marking the policy as failed.
// After this many retries, the NNCE will be marked as FailedToConfigure.
// +kubebuilder:default:=5
// +kubebuilder:validation:Minimum=0
// +kubebuilder:validation:Maximum=100
// +optional
MaxRetries int `json:"maxRetries,omitempty"`
}

// NMStateStatus defines the observed state of NMState
type NMStateStatus struct {
// +optional
Expand Down
18 changes: 18 additions & 0 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

80 changes: 75 additions & 5 deletions cmd/handler/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,62 @@ func cacheResourcesOnNodes(ctrlOptions *ctrl.Options) {
}
}

// Returns initialBackoff, maximumBackoff, maxRetries, and error with the following logic:
// default values if unset, config if set, default values if error or validations failed.
func getRetryConfiguration(
ctx context.Context, apiClient client.Client,
) (initialBackoff, maximumBackoff time.Duration, maxRetries int, err error) {
const (
defaultInitialBackoff = 1 * time.Second
defaultMaximumBackoff = 30 * time.Second
defaultMaxRetries = 5
)

nmstateList := &nmstatev1.NMStateList{}
err = apiClient.List(ctx, nmstateList)
if err != nil {
return defaultInitialBackoff, defaultMaximumBackoff, defaultMaxRetries, errors.Wrap(err, "failed to list NMState CRs")
}

// No NMState CR -> use defaults
if len(nmstateList.Items) == 0 {
return defaultInitialBackoff, defaultMaximumBackoff, defaultMaxRetries, nil
}

// Use the first NMState CR (there should only be one in a typical deployment)
nmstate := nmstateList.Items[0]
retryConfig := nmstate.Spec.RetryConfiguration

initialBackoff = defaultInitialBackoff
if retryConfig.InitialBackoff.Duration != 0 {
initialBackoff = retryConfig.InitialBackoff.Duration
}

maximumBackoff = defaultMaximumBackoff
if retryConfig.MaximumBackoff.Duration != 0 {
maximumBackoff = retryConfig.MaximumBackoff.Duration
}

maxRetries = defaultMaxRetries
if retryConfig.MaxRetries != 0 {
maxRetries = retryConfig.MaxRetries
}

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The current logic for reading the retry configuration from the NMState CR is flawed. It checks if the values are non-zero before using them, which prevents a user from setting a valid value of 0 for maxRetries. If a user sets maxRetries: 0, this code will ignore it and use the default of 5.

The kubebuilder:default annotations on the API type ensure that if a user doesn't specify a value, it will be defaulted by the API server. The code should use the values from the CR directly. For old CRs that don't have this field, the values will be zero, which will be caught by the subsequent validation logic and correctly fall back to defaults.

initialBackoff = retryConfig.InitialBackoff.Duration
maximumBackoff = retryConfig.MaximumBackoff.Duration
maxRetries = retryConfig.MaxRetries


if initialBackoff >= maximumBackoff {
setupLog.Info("Invalid retry configuration: initialBackoff must be less than maximumBackoff, using defaults",
"initialBackoff", initialBackoff, "maximumBackoff", maximumBackoff)
return defaultInitialBackoff, defaultMaximumBackoff, defaultMaxRetries, nil
}

if initialBackoff <= 0 || maximumBackoff <= 0 {
setupLog.Info("Invalid retry configuration: backoff durations must be positive, using defaults",
"initialBackoff", initialBackoff, "maximumBackoff", maximumBackoff)
return defaultInitialBackoff, defaultMaximumBackoff, defaultMaxRetries, nil
}

return initialBackoff, maximumBackoff, maxRetries, nil
}

func setupHandlerControllers(mgr manager.Manager) error {
setupLog.Info("Creating Node controller")
if err := (&controllers.NodeReconciler{
Expand All @@ -278,13 +334,27 @@ func setupHandlerControllers(mgr manager.Manager) error {
return err
}

setupLog.Info("Retrieving retry configuration from NMState CR")
initialBackoff, maximumBackoff, maxRetries, err := getRetryConfiguration(context.Background(), apiClient)
if err != nil {
setupLog.Error(err, "failed to retrieve retry configuration, using defaults")
// Use default values if we can't retrieve the configuration
initialBackoff = 1 * time.Second
maximumBackoff = 30 * time.Second
maxRetries = 5
}
Comment on lines 327 to 329

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The getRetryConfiguration function already returns default values when an error occurs. The if err != nil block here re-assigns the same defaults, which is redundant. You can simplify this by just logging the error and proceeding with the values returned by the function.

if err != nil {
	setupLog.Error(err, "failed to retrieve retry configuration, using defaults")
}

setupLog.Info("Using retry configuration", "initialBackoff", initialBackoff, "maximumBackoff", maximumBackoff, "maxRetries", maxRetries)

setupLog.Info("Creating NodeNetworkConfigurationPolicy controller")
if err = (&controllers.NodeNetworkConfigurationPolicyReconciler{
Client: mgr.GetClient(),
APIClient: apiClient,
Log: ctrl.Log.WithName("controllers").WithName("NodeNetworkConfigurationPolicy"),
Scheme: mgr.GetScheme(),
Recorder: mgr.GetEventRecorderFor(fmt.Sprintf("%s.nmstate-handler", environment.NodeName())),
Client: mgr.GetClient(),
APIClient: apiClient,
Log: ctrl.Log.WithName("controllers").WithName("NodeNetworkConfigurationPolicy"),
Scheme: mgr.GetScheme(),
Recorder: mgr.GetEventRecorderFor(fmt.Sprintf("%s.nmstate-handler", environment.NodeName())),
InitialBackoff: initialBackoff,
MaximumBackoff: maximumBackoff,
MaxRetries: maxRetries,
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create NodeNetworkConfigurationPolicy controller", "controller", "NMState")
return err
Expand Down
19 changes: 11 additions & 8 deletions controllers/handler/nodenetworkconfigurationpolicy_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,13 @@ type NodeNetworkConfigurationPolicyReconciler struct {
client.Client
// APIClient controller-runtime client without cache, it will be used at
// places where whole cluster resources need to be retrieved but not cached.
APIClient client.Client
Log logr.Logger
Scheme *runtime.Scheme
Recorder record.EventRecorder
APIClient client.Client
Log logr.Logger
Scheme *runtime.Scheme
Recorder record.EventRecorder
InitialBackoff time.Duration
MaximumBackoff time.Duration
MaxRetries int
}

func init() {
Expand Down Expand Up @@ -238,15 +241,15 @@ func (r *NodeNetworkConfigurationPolicyReconciler) Reconcile(ctx context.Context
return ctrl.Result{}, err
}

if enactmentInstance.Status.RetryCount[generationKey] >= RetriesUntilFail {
if enactmentInstance.Status.RetryCount[generationKey] >= r.MaxRetries {
enactmentConditions.NotifyFailedToConfigure(ctx, errmsg)
if r.Recorder != nil {
r.Recorder.Event(instance,
corev1.EventTypeWarning,
ReconcileFailed,
fmt.Errorf(
"reconciliation of enactment %q has failed after %d retries",
enactmentInstance.Name, RetriesUntilFail).Error())
enactmentInstance.Name, r.MaxRetries).Error())
}
return ctrl.Result{}, nil
}
Expand All @@ -255,7 +258,7 @@ func (r *NodeNetworkConfigurationPolicyReconciler) Reconcile(ctx context.Context
fmt.Errorf("failed to reconcile NodeNetworkConfigurationPolicy on node %s. Retrying %d/%d",
nodeName,
enactmentInstance.Status.RetryCount[generationKey]+1,
RetriesUntilFail),
r.MaxRetries),
)
return ctrl.Result{Requeue: true}, nil
}
Expand Down Expand Up @@ -303,7 +306,7 @@ func (r *NodeNetworkConfigurationPolicyReconciler) SetupWithManager(mgr ctrl.Man
mgr,
controller.Options{
Reconciler: r,
RateLimiter: workqueue.NewTypedItemExponentialFailureRateLimiter[reconcile.Request](time.Second, time.Second*MaximumTimeBackoff),
RateLimiter: workqueue.NewTypedItemExponentialFailureRateLimiter[reconcile.Request](r.InitialBackoff, r.MaximumBackoff),
})
if err != nil {
return errors.Wrap(err, "failed to create NodeNetworkConfigurationPolicy controller")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package controllers

import (
"context"
"time"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
Expand Down Expand Up @@ -94,7 +95,11 @@ var _ = Describe("NodeNetworkConfigurationPolicy controller predicates", func()
DescribeTable("when claimNodeRunningUpdate is called and",
func(c incrementUnavailableNodeCountCase) {
nmstatectlShowFn = func() (string, error) { return "", nil }
reconciler := NodeNetworkConfigurationPolicyReconciler{}
reconciler := NodeNetworkConfigurationPolicyReconciler{
InitialBackoff: 1 * time.Second,
MaximumBackoff: 30 * time.Second,
MaxRetries: 5,
}
s := scheme.Scheme
s.AddKnownTypes(nmstatev1beta1.GroupVersion,
&nmstatev1beta1.NodeNetworkState{},
Expand Down Expand Up @@ -305,7 +310,11 @@ var _ = Describe("NodeNetworkConfigurationPolicy controller predicates", func()
)

BeforeEach(func() {
reconciler = &NodeNetworkConfigurationPolicyReconciler{}
reconciler = &NodeNetworkConfigurationPolicyReconciler{
InitialBackoff: 1 * time.Second,
MaximumBackoff: 30 * time.Second,
MaxRetries: 5,
}
s = scheme.Scheme
s.AddKnownTypes(nmstatev1.GroupVersion,
&nmstatev1.NodeNetworkConfigurationPolicy{},
Expand Down
27 changes: 27 additions & 0 deletions deploy/crds/nmstate.io_nmstates.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1989,6 +1989,33 @@ spec:
- host
type: object
type: object
retryConfiguration:
default: {}
description: |-
RetryConfiguration is an optional configuration for NNCP retry and backoff strategy.
If RetryConfiguration is specified, the handler will use the config defined here instead of its default values.
properties:
initialBackoff:
default: 1s
description: |-
InitialBackoff is the initial retry delay when applying network configuration fails.
The backoff will increase exponentially up to MaximumBackoff.
type: string
maxRetries:
default: 5
description: |-
MaxRetries is the maximum number of retry attempts before marking the policy as failed.
After this many retries, the NNCE will be marked as FailedToConfigure.
maximum: 100
minimum: 0
type: integer
maximumBackoff:
default: 30s
description: |-
MaximumBackoff is the maximum retry delay when applying network configuration fails.
The exponential backoff will not exceed this value.
type: string
type: object
selfSignConfiguration:
description: SelfSignConfiguration defines self signed certificate
configuration
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.