Changes from 1 commit
63 changes: 26 additions & 37 deletions controllers/clusterpolicy_controller.go
@@ -41,8 +41,6 @@ import (
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"sigs.k8s.io/controller-runtime/pkg/source"

"github.com/NVIDIA/k8s-operator-libs/pkg/consts"

gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
"github.com/NVIDIA/gpu-operator/internal/conditions"
)
@@ -97,11 +95,9 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques

// Fetch the ClusterPolicy instance
instance := &gpuv1.ClusterPolicy{}
- var condErr error
- err := r.Get(ctx, req.NamespacedName, instance)
- if err != nil {
- err = fmt.Errorf("failed to get ClusterPolicy object: %v", err)
- r.Log.Error(nil, err.Error())
+ if err := r.Get(ctx, req.NamespacedName, instance); err != nil {
+ err = fmt.Errorf("failed to get ClusterPolicy object: %w", err)
+ r.Log.Error(err, "unable to fetch ClusterPolicy")
clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusClusterPolicyUnavailable)
if apierrors.IsNotFound(err) {
// Request object not found, could have been deleted after reconcile request.
@@ -110,9 +106,8 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques
return reconcile.Result{}, nil
}
// Error reading the object - requeue the request.
- condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error())
- if condErr != nil {
- r.Log.V(consts.LogLevelDebug).Error(nil, condErr.Error())
+ if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
+ r.Log.Error(condErr, "failed to set condition")
}
return reconcile.Result{}, err
}
@@ -123,16 +118,13 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques
instance.SetStatus(gpuv1.Ignored, clusterPolicyCtrl.operatorNamespace)
// do not change `clusterPolicyCtrl.operatorMetrics.reconciliationStatus` here,
// spurious reconciliation
- return ctrl.Result{}, err
+ return ctrl.Result{}, nil
}

- err = clusterPolicyCtrl.init(ctx, r, instance)
- if err != nil {
- err = fmt.Errorf("failed to initialize ClusterPolicy controller: %v", err)
- r.Log.Error(nil, err.Error())
- condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error())
- if condErr != nil {
- r.Log.V(consts.LogLevelDebug).Error(nil, condErr.Error())
+ if err := clusterPolicyCtrl.init(ctx, r, instance); err != nil {
+ r.Log.Error(err, "unable to initialize ClusterPolicy controller")
+ if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
+ r.Log.Error(condErr, "failed to set condition")
}
if clusterPolicyCtrl.operatorMetrics != nil {
clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusClusterPolicyUnavailable)
@@ -159,11 +151,10 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques
clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusNotReady)
clusterPolicyCtrl.operatorMetrics.reconciliationFailed.Inc()
updateCRState(ctx, r, req.NamespacedName, gpuv1.NotReady)
- condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, fmt.Sprintf("Failed to reconcile %s: %s", clusterPolicyCtrl.stateNames[clusterPolicyCtrl.idx], statusError.Error()))
- if condErr != nil {
- r.Log.V(consts.LogLevelDebug).Error(nil, condErr.Error())
+ if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, fmt.Sprintf("Failed to reconcile %s: %s", clusterPolicyCtrl.stateNames[clusterPolicyCtrl.idx], statusError.Error())); condErr != nil {
+ r.Log.Error(condErr, "failed to set condition")
}
- return ctrl.Result{RequeueAfter: time.Second * 5}, statusError
+ return ctrl.Result{}, statusError
}

if status == gpuv1.NotReady {
@@ -179,17 +170,16 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques
}
}

- // if any state is not ready, requeue for reconfile after 5 seconds
+ // if any state is not ready, requeue for reconcile after 5 seconds
if overallStatus != gpuv1.Ready {
clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusNotReady)
clusterPolicyCtrl.operatorMetrics.reconciliationFailed.Inc()

- errStr := fmt.Sprintf("ClusterPolicy is not ready, states not ready: %v", statesNotReady)
- r.Log.Error(nil, errStr)
+ err := fmt.Errorf("ClusterPolicy is not ready, states not ready: %v", statesNotReady)
+ r.Log.Error(err, "ClusterPolicy not yet ready")
updateCRState(ctx, r, req.NamespacedName, gpuv1.NotReady)
- condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.OperandNotReady, errStr)
- if condErr != nil {
- r.Log.V(consts.LogLevelDebug).Error(nil, condErr.Error())
+ if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.OperandNotReady, err.Error()); condErr != nil {
+ r.Log.Error(condErr, "failed to set condition")
}
return ctrl.Result{RequeueAfter: time.Second * 5}, nil
}
@@ -203,9 +193,8 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques

// Update CR state as ready as all states are complete
updateCRState(ctx, r, req.NamespacedName, gpuv1.Ready)
- condErr = r.conditionUpdater.SetConditionsReady(ctx, instance, conditions.NFDLabelsMissing, "No NFD labels found")
- if condErr != nil {
- r.Log.V(consts.LogLevelDebug).Error(nil, condErr.Error())
+ if condErr := r.conditionUpdater.SetConditionsReady(ctx, instance, conditions.NFDLabelsMissing, "No NFD labels found"); condErr != nil {
+ r.Log.Error(condErr, "failed to set condition")
}

clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusSuccess)
@@ -222,13 +211,15 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques
if !clusterPolicyCtrl.hasGPUNodes {
infoStr = "No GPU node found, watching for new nodes to join the cluster."
r.Log.Info(infoStr, "hasNFDLabels", clusterPolicyCtrl.hasNFDLabels)
- if condErr = r.conditionUpdater.SetConditionsReady(ctx, instance, conditions.NoGPUNodes, infoStr); condErr != nil {
+ if condErr := r.conditionUpdater.SetConditionsReady(ctx, instance, conditions.NoGPUNodes, infoStr); condErr != nil {
+ r.Log.Error(condErr, "failed to set condition")
return ctrl.Result{}, condErr
}
} else {
infoStr = "ClusterPolicy is ready as all resources have been successfully reconciled"
r.Log.Info(infoStr)
- if condErr = r.conditionUpdater.SetConditionsReady(ctx, instance, conditions.Reconciled, infoStr); condErr != nil {
+ if condErr := r.conditionUpdater.SetConditionsReady(ctx, instance, conditions.Reconciled, infoStr); condErr != nil {
+ r.Log.Error(condErr, "failed to set condition")
return ctrl.Result{}, condErr
}
}
@@ -238,8 +229,7 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques
func updateCRState(ctx context.Context, r *ClusterPolicyReconciler, namespacedName types.NamespacedName, state gpuv1.State) {
// Fetch latest instance and update state to avoid version mismatch
instance := &gpuv1.ClusterPolicy{}
- err := r.Get(ctx, namespacedName, instance)
- if err != nil {
+ if err := r.Get(ctx, namespacedName, instance); err != nil {
r.Log.Error(err, "Failed to get ClusterPolicy instance for status update")
}
if instance.Status.State == state {
@@ -248,8 +238,7 @@ func updateCRState(ctx context.Context, r *ClusterPolicyReconciler, namespacedNa
}
// Update the CR state
instance.SetStatus(state, clusterPolicyCtrl.operatorNamespace)
- err = r.Client.Status().Update(ctx, instance)
- if err != nil {
+ if err := r.Client.Status().Update(ctx, instance); err != nil {
r.Log.Error(err, "Failed to update ClusterPolicy status")
}
}
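A note on the requeue change in this file (returning `ctrl.Result{}, statusError` instead of `ctrl.Result{RequeueAfter: time.Second * 5}, statusError`): when Reconcile returns a non-nil error, controller-runtime requeues the request itself with exponential backoff, so an explicit RequeueAfter alongside an error is redundant. A minimal, self-contained sketch of the convention; `doWork` and `ready` are hypothetical stand-ins:

```go
package controllers

import (
	"errors"
	"time"

	ctrl "sigs.k8s.io/controller-runtime"
)

var ready bool

func doWork() error { return errors.New("transient failure") }

func reconcileSketch() (ctrl.Result, error) {
	if err := doWork(); err != nil {
		// Non-nil error: controller-runtime requeues with backoff,
		// so no explicit RequeueAfter is needed here.
		return ctrl.Result{}, err
	}
	if !ready {
		// No error, but not done yet: request a fixed requeue interval.
		return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
	}
	return ctrl.Result{}, nil
}
```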
95 changes: 41 additions & 54 deletions controllers/nvidiadriver_controller.go
@@ -18,6 +18,7 @@ package controllers

import (
"context"
"errors"
"fmt"
"maps"
"time"
@@ -79,47 +80,41 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request

// Get the NvidiaDriver instance from this request
instance := &nvidiav1alpha1.NVIDIADriver{}
- var condErr error
- err := r.Get(ctx, req.NamespacedName, instance)
- if err != nil {
+ if err := r.Get(ctx, req.NamespacedName, instance); err != nil {
if apierrors.IsNotFound(err) {
// Request object not found, could have been deleted after reconcile request.
// Owned objects are automatically garbage collected. For additional cleanup logic use finalizers.
// Return and don't requeue
return reconcile.Result{}, nil
}
err = fmt.Errorf("error getting NVIDIADriver object: %w", err)
logger.V(consts.LogLevelError).Error(nil, err.Error())
wrappedErr := fmt.Errorf("error getting NVIDIADriver object: %w", err)
logger.Error(err, "error getting NVIDIADriver object")
instance.Status.State = nvidiav1alpha1.NotReady
- condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error())
- if condErr != nil {
- logger.V(consts.LogLevelDebug).Error(nil, condErr.Error())
+ if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, wrappedErr.Error()); condErr != nil {
+ logger.Error(condErr, "failed to set condition")
}
// Error reading the object - requeue the request.
- return reconcile.Result{}, err
+ return reconcile.Result{}, wrappedErr
}

// Get the singleton NVIDIA ClusterPolicy object in the cluster.
clusterPolicyList := &gpuv1.ClusterPolicyList{}
- err = r.List(ctx, clusterPolicyList)
- if err != nil {
- err = fmt.Errorf("error getting ClusterPolicy list: %v", err)
- logger.V(consts.LogLevelError).Error(nil, err.Error())
+ if err := r.List(ctx, clusterPolicyList); err != nil {
+ err = fmt.Errorf("error getting ClusterPolicy list: %w", err)
+ logger.Error(err, "error getting ClusterPolicy list")
Contributor:

Did you mean

Suggested change:
- err = fmt.Errorf("error getting ClusterPolicy list: %w", err)
- logger.Error(err, "error getting ClusterPolicy list")
+ wrappedError := fmt.Errorf("error getting ClusterPolicy list: %w", err)
+ logger.Error(err, "error getting ClusterPolicy list")

to align with the similar code block from above?

Contributor Author:

Yeah, we can do that here as well. Updated the code.
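For readers skimming the thread, here is a minimal, self-contained sketch of the wrap-and-log pattern being suggested; the `listObjects` parameter is a hypothetical stand-in for `r.List(ctx, clusterPolicyList)`:

```go
package controllers

import (
	"fmt"

	"github.com/go-logr/logr"
)

// listClusterPolicies shows the pattern: wrap the root cause with %w so
// callers can inspect it via errors.Is/errors.As, log the original error,
// and return the wrapped one.
func listClusterPolicies(logger logr.Logger, listObjects func() error) error {
	if err := listObjects(); err != nil {
		wrappedErr := fmt.Errorf("error getting ClusterPolicy list: %w", err)
		logger.Error(err, "error getting ClusterPolicy list")
		return wrappedErr
	}
	return nil
}
```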

instance.Status.State = nvidiav1alpha1.NotReady
- condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error())
- if condErr != nil {
- logger.V(consts.LogLevelDebug).Error(nil, condErr.Error())
+ if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
+ logger.Error(condErr, "failed to set condition")
}
- return reconcile.Result{}, fmt.Errorf("error getting ClusterPolicyList: %v", err)
+ return reconcile.Result{}, err
}

if len(clusterPolicyList.Items) == 0 {
err = fmt.Errorf("no ClusterPolicy object found in the cluster")
logger.V(consts.LogLevelError).Error(nil, err.Error())
err := fmt.Errorf("no ClusterPolicy object found in the cluster")
logger.Error(err, "no ClusterPolicy object found in the cluster")
Contributor:

Question: What will get printed here? I assume our message will be printed twice?

Contributor Author:

Yes, msg and error keys will have the same message. I updated it now to have different values. Here is what it will log now:

{"level":"error","ts":1762442249.3265705,"msg":"failed to get ClusterPolicy object","controller":"nvidia-driver-controller","object":{"name":"test1"},"namespace":"","name":"test1","reconcileID":"c1345edc-ac0c-4417-841f-d71f101c5071","error":"no ClusterPolicy object found in the cluster"}
{"level":"error","ts":1762442249.3296852,"msg":"Reconciler error","controller":"nvidia-driver-controller","object":{"name":"test1"},"namespace":"","name":"test1","reconcileID":"c1345edc-ac0c-4417-841f-d71f101c5071","error":"no ClusterPolicy object found in the cluster"}

instance.Status.State = nvidiav1alpha1.NotReady
- condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error())
- if condErr != nil {
- logger.V(consts.LogLevelDebug).Error(nil, condErr.Error())
+ if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
+ logger.Error(condErr, "failed to set condition")
}
return reconcile.Result{}, err
}
@@ -137,34 +132,30 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
// Verify the nodeSelector configured for this NVIDIADriver instance does
// not conflict with any other instances. This ensures only one driver
// is deployed per GPU node.
- err = r.nodeSelectorValidator.Validate(ctx, instance)
- if err != nil {
- logger.V(consts.LogLevelError).Error(nil, err.Error())
- condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ConflictingNodeSelector, err.Error())
- if condErr != nil {
- logger.V(consts.LogLevelDebug).Error(nil, condErr.Error())
+ if err := r.nodeSelectorValidator.Validate(ctx, instance); err != nil {
+ logger.Error(err, "nodeSelector validation failed")
+ if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ConflictingNodeSelector, err.Error()); condErr != nil {
+ logger.Error(condErr, "failed to set condition")
}
return reconcile.Result{}, nil
}

if instance.Spec.UsePrecompiledDrivers() && (instance.Spec.IsGDSEnabled() || instance.Spec.IsGDRCopyEnabled()) {
err = fmt.Errorf("GPUDirect Storage driver (nvidia-fs) and/or GDRCopy driver is not supported along with pre-compiled NVIDIA drivers")
logger.V(consts.LogLevelError).Error(nil, err.Error())
err := errors.New("GPUDirect Storage driver (nvidia-fs) and/or GDRCopy driver is not supported along with pre-compiled NVIDIA drivers")
logger.Error(err, "unsupported driver combination detected")
instance.Status.State = nvidiav1alpha1.NotReady
- condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error())
- if condErr != nil {
- logger.V(consts.LogLevelDebug).Error(nil, condErr.Error())
+ if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
+ logger.Error(condErr, "failed to set condition")
}
return reconcile.Result{}, nil
}

if instance.Spec.IsGDSEnabled() && instance.Spec.IsOpenKernelModulesRequired() && !instance.Spec.IsOpenKernelModulesEnabled() {
err = fmt.Errorf("GPUDirect Storage driver '%s' is only supported with NVIDIA OpenRM drivers. Please set 'useOpenKernelModules=true' to enable OpenRM mode", instance.Spec.GPUDirectStorage.Version)
logger.V(consts.LogLevelError).Error(nil, err.Error())
err := fmt.Errorf("GPUDirect Storage driver '%s' is only supported with NVIDIA OpenRM drivers. Please set 'useOpenKernelModules=true' to enable OpenRM mode", instance.Spec.GPUDirectStorage.Version)
logger.Error(err, "unsupported driver combination detected")
instance.Status.State = nvidiav1alpha1.NotReady
- condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error())
- if condErr != nil {
- logger.V(consts.LogLevelDebug).Error(nil, condErr.Error())
+ if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
+ logger.Error(condErr, "failed to set condition")
}
return reconcile.Result{}, nil
}
@@ -173,12 +164,10 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
secretName := instance.Spec.SecretEnv
if len(secretName) > 0 {
key := client.ObjectKey{Namespace: r.Namespace, Name: secretName}
- err = r.Get(ctx, key, &corev1.Secret{})
- if err != nil {
- logger.V(consts.LogLevelError).Error(nil, err.Error())
- condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error())
- if condErr != nil {
- logger.V(consts.LogLevelDebug).Error(nil, condErr.Error())
+ if err := r.Get(ctx, key, &corev1.Secret{}); err != nil {
+ logger.Error(err, "failed to get secret")
+ if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
+ logger.Error(condErr, "failed to set condition")
}
return reconcile.Result{}, nil
}
@@ -188,8 +177,7 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
managerStatus := r.stateManager.SyncState(ctx, instance, infoCatalog)

// update CR status
- err = r.updateCrStatus(ctx, instance, managerStatus)
- if err != nil {
+ if err := r.updateCrStatus(ctx, instance, managerStatus); err != nil {
return ctrl.Result{}, err
}

@@ -199,24 +187,23 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
for _, result := range managerStatus.StatesStatus {
if result.Status != state.SyncStateReady && result.ErrInfo != nil {
errorInfo = result.ErrInfo
- condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, fmt.Sprintf("Error syncing state %s: %v", result.StateName, errorInfo.Error()))
- if condErr != nil {
- logger.V(consts.LogLevelDebug).Error(nil, condErr.Error())
+ if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, fmt.Sprintf("Error syncing state %s: %v", result.StateName, errorInfo.Error())); condErr != nil {
+ logger.Error(condErr, "failed to set condition")
}
break
}
}
// if no errors are reported from any state, then we would be waiting on driver daemonset pods
if errorInfo == nil {
- condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.DriverNotReady, "Waiting for driver pod to be ready")
- if condErr != nil {
- logger.V(consts.LogLevelDebug).Error(nil, condErr.Error())
+ if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.DriverNotReady, "Waiting for driver pod to be ready"); condErr != nil {
+ logger.Error(condErr, "failed to set condition")
}
}
return reconcile.Result{RequeueAfter: time.Second * 5}, nil
}

- if condErr = r.conditionUpdater.SetConditionsReady(ctx, instance, conditions.Reconciled, "All resources have been successfully reconciled"); condErr != nil {
+ if condErr := r.conditionUpdater.SetConditionsReady(ctx, instance, conditions.Reconciled, "All resources have been successfully reconciled"); condErr != nil {
+ logger.Error(condErr, "failed to set condition")
return ctrl.Result{}, condErr
}
return reconcile.Result{}, nil
@@ -244,7 +231,7 @@ func (r *NVIDIADriverReconciler) updateCrStatus(
reqLogger.V(consts.LogLevelInfo).Info("Updating CR Status", "Status", instance.Status)
err = r.Status().Update(ctx, instance)
if err != nil {
- reqLogger.V(consts.LogLevelError).Error(err, "Failed to update CR status")
+ reqLogger.Error(err, "Failed to update CR status")
return err
}
return nil
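One more small idiom in this file worth calling out: `fmt.Errorf` is swapped for `errors.New` where the message contains no format verbs (hence the new `errors` import), while `fmt.Errorf` remains where formatting is needed. A hedged sketch of the distinction; `exampleErrs` and its messages are illustrative only:

```go
package controllers

import (
	"errors"
	"fmt"
)

func exampleErrs(version string) (static, formatted error) {
	// No format verbs: errors.New is the idiomatic constructor.
	static = errors.New("pre-compiled drivers do not support GDS or GDRCopy")
	// Formatting required: fmt.Errorf is warranted.
	formatted = fmt.Errorf("GPUDirect Storage driver '%s' requires open kernel modules", version)
	return static, formatted
}
```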
4 changes: 2 additions & 2 deletions controllers/upgrade_controller.go
@@ -86,7 +86,7 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
clusterPolicy := &gpuv1.ClusterPolicy{}
err := r.Get(ctx, req.NamespacedName, clusterPolicy)
if err != nil {
- reqLogger.V(consts.LogLevelError).Error(err, "Error getting ClusterPolicy object")
+ reqLogger.Error(err, "Error getting ClusterPolicy object")
if clusterPolicyCtrl.operatorMetrics != nil {
clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusClusterPolicyUnavailable)
}
@@ -218,7 +218,7 @@ func (r *UpgradeReconciler) removeNodeUpgradeStateLabels(ctx context.Context) er
delete(node.Labels, upgradeStateLabel)
err = r.Update(ctx, node)
if err != nil {
- r.Log.V(consts.LogLevelError).Error(
+ r.Log.Error(
Contributor:

nit: can we update this to just be one line?

Contributor Author:

updated.

err, "Failed to reset upgrade state label from node", "node", node)
return err
}