Skip to content

Commit c7225a0

Browse files
committed
fix logging for controllers
1 parent 7fd1346 commit c7225a0

File tree

3 files changed

+69
-93
lines changed

3 files changed

+69
-93
lines changed

controllers/clusterpolicy_controller.go

Lines changed: 26 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,6 @@ import (
4141
"sigs.k8s.io/controller-runtime/pkg/reconcile"
4242
"sigs.k8s.io/controller-runtime/pkg/source"
4343

44-
"github.com/NVIDIA/k8s-operator-libs/pkg/consts"
45-
4644
gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
4745
"github.com/NVIDIA/gpu-operator/internal/conditions"
4846
)
@@ -97,11 +95,9 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques
9795

9896
// Fetch the ClusterPolicy instance
9997
instance := &gpuv1.ClusterPolicy{}
100-
var condErr error
101-
err := r.Get(ctx, req.NamespacedName, instance)
102-
if err != nil {
103-
err = fmt.Errorf("failed to get ClusterPolicy object: %v", err)
104-
r.Log.Error(nil, err.Error())
98+
if err := r.Get(ctx, req.NamespacedName, instance); err != nil {
99+
err = fmt.Errorf("failed to get ClusterPolicy object: %w", err)
100+
r.Log.Error(err, "unable to fetch ClusterPolicy")
105101
clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusClusterPolicyUnavailable)
106102
if apierrors.IsNotFound(err) {
107103
// Request object not found, could have been deleted after reconcile request.
@@ -110,9 +106,8 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques
110106
return reconcile.Result{}, nil
111107
}
112108
// Error reading the object - requeue the request.
113-
condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error())
114-
if condErr != nil {
115-
r.Log.V(consts.LogLevelDebug).Error(nil, condErr.Error())
109+
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
110+
r.Log.Error(condErr, "failed to set condition")
116111
}
117112
return reconcile.Result{}, err
118113
}
@@ -123,16 +118,13 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques
123118
instance.SetStatus(gpuv1.Ignored, clusterPolicyCtrl.operatorNamespace)
124119
// do not change `clusterPolicyCtrl.operatorMetrics.reconciliationStatus` here,
125120
// spurious reconciliation
126-
return ctrl.Result{}, err
121+
return ctrl.Result{}, nil
127122
}
128123

129-
err = clusterPolicyCtrl.init(ctx, r, instance)
130-
if err != nil {
131-
err = fmt.Errorf("failed to initialize ClusterPolicy controller: %v", err)
132-
r.Log.Error(nil, err.Error())
133-
condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error())
134-
if condErr != nil {
135-
r.Log.V(consts.LogLevelDebug).Error(nil, condErr.Error())
124+
if err := clusterPolicyCtrl.init(ctx, r, instance); err != nil {
125+
r.Log.Error(err, "unable to initialize ClusterPolicy controller")
126+
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
127+
r.Log.Error(condErr, "failed to set condition")
136128
}
137129
if clusterPolicyCtrl.operatorMetrics != nil {
138130
clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusClusterPolicyUnavailable)
@@ -159,11 +151,10 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques
159151
clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusNotReady)
160152
clusterPolicyCtrl.operatorMetrics.reconciliationFailed.Inc()
161153
updateCRState(ctx, r, req.NamespacedName, gpuv1.NotReady)
162-
condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, fmt.Sprintf("Failed to reconcile %s: %s", clusterPolicyCtrl.stateNames[clusterPolicyCtrl.idx], statusError.Error()))
163-
if condErr != nil {
164-
r.Log.V(consts.LogLevelDebug).Error(nil, condErr.Error())
154+
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, fmt.Sprintf("Failed to reconcile %s: %s", clusterPolicyCtrl.stateNames[clusterPolicyCtrl.idx], statusError.Error())); condErr != nil {
155+
r.Log.Error(condErr, "failed to set condition")
165156
}
166-
return ctrl.Result{RequeueAfter: time.Second * 5}, statusError
157+
return ctrl.Result{}, statusError
167158
}
168159

169160
if status == gpuv1.NotReady {
@@ -179,17 +170,16 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques
179170
}
180171
}
181172

182-
// if any state is not ready, requeue for reconfile after 5 seconds
173+
// if any state is not ready, requeue for reconcile after 5 seconds
183174
if overallStatus != gpuv1.Ready {
184175
clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusNotReady)
185176
clusterPolicyCtrl.operatorMetrics.reconciliationFailed.Inc()
186177

187-
errStr := fmt.Sprintf("ClusterPolicy is not ready, states not ready: %v", statesNotReady)
188-
r.Log.Error(nil, errStr)
178+
err := fmt.Errorf("ClusterPolicy is not ready, states not ready: %v", statesNotReady)
179+
r.Log.Error(err, "ClusterPolicy not yet ready")
189180
updateCRState(ctx, r, req.NamespacedName, gpuv1.NotReady)
190-
condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.OperandNotReady, errStr)
191-
if condErr != nil {
192-
r.Log.V(consts.LogLevelDebug).Error(nil, condErr.Error())
181+
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.OperandNotReady, err.Error()); condErr != nil {
182+
r.Log.Error(condErr, "failed to set condition")
193183
}
194184
return ctrl.Result{RequeueAfter: time.Second * 5}, nil
195185
}
@@ -203,9 +193,8 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques
203193

204194
// Update CR state as ready as all states are complete
205195
updateCRState(ctx, r, req.NamespacedName, gpuv1.Ready)
206-
condErr = r.conditionUpdater.SetConditionsReady(ctx, instance, conditions.NFDLabelsMissing, "No NFD labels found")
207-
if condErr != nil {
208-
r.Log.V(consts.LogLevelDebug).Error(nil, condErr.Error())
196+
if condErr := r.conditionUpdater.SetConditionsReady(ctx, instance, conditions.NFDLabelsMissing, "No NFD labels found"); condErr != nil {
197+
r.Log.Error(condErr, "failed to set condition")
209198
}
210199

211200
clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusSuccess)
@@ -222,13 +211,15 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques
222211
if !clusterPolicyCtrl.hasGPUNodes {
223212
infoStr = "No GPU node found, watching for new nodes to join the cluster."
224213
r.Log.Info(infoStr, "hasNFDLabels", clusterPolicyCtrl.hasNFDLabels)
225-
if condErr = r.conditionUpdater.SetConditionsReady(ctx, instance, conditions.NoGPUNodes, infoStr); condErr != nil {
214+
if condErr := r.conditionUpdater.SetConditionsReady(ctx, instance, conditions.NoGPUNodes, infoStr); condErr != nil {
215+
r.Log.Error(condErr, "failed to set condition")
226216
return ctrl.Result{}, condErr
227217
}
228218
} else {
229219
infoStr = "ClusterPolicy is ready as all resources have been successfully reconciled"
230220
r.Log.Info(infoStr)
231-
if condErr = r.conditionUpdater.SetConditionsReady(ctx, instance, conditions.Reconciled, infoStr); condErr != nil {
221+
if condErr := r.conditionUpdater.SetConditionsReady(ctx, instance, conditions.Reconciled, infoStr); condErr != nil {
222+
r.Log.Error(condErr, "failed to set condition")
232223
return ctrl.Result{}, condErr
233224
}
234225
}
@@ -238,8 +229,7 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques
238229
func updateCRState(ctx context.Context, r *ClusterPolicyReconciler, namespacedName types.NamespacedName, state gpuv1.State) {
239230
// Fetch latest instance and update state to avoid version mismatch
240231
instance := &gpuv1.ClusterPolicy{}
241-
err := r.Get(ctx, namespacedName, instance)
242-
if err != nil {
232+
if err := r.Get(ctx, namespacedName, instance); err != nil {
243233
r.Log.Error(err, "Failed to get ClusterPolicy instance for status update")
244234
}
245235
if instance.Status.State == state {
@@ -248,8 +238,7 @@ func updateCRState(ctx context.Context, r *ClusterPolicyReconciler, namespacedNa
248238
}
249239
// Update the CR state
250240
instance.SetStatus(state, clusterPolicyCtrl.operatorNamespace)
251-
err = r.Client.Status().Update(ctx, instance)
252-
if err != nil {
241+
if err := r.Client.Status().Update(ctx, instance); err != nil {
253242
r.Log.Error(err, "Failed to update ClusterPolicy status")
254243
}
255244
}

controllers/nvidiadriver_controller.go

Lines changed: 41 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package controllers
1818

1919
import (
2020
"context"
21+
"errors"
2122
"fmt"
2223
"maps"
2324
"time"
@@ -79,47 +80,41 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
7980

8081
// Get the NvidiaDriver instance from this request
8182
instance := &nvidiav1alpha1.NVIDIADriver{}
82-
var condErr error
83-
err := r.Get(ctx, req.NamespacedName, instance)
84-
if err != nil {
83+
if err := r.Get(ctx, req.NamespacedName, instance); err != nil {
8584
if apierrors.IsNotFound(err) {
8685
// Request object not found, could have been deleted after reconcile request.
8786
// Owned objects are automatically garbage collected. For additional cleanup logic use finalizers.
8887
// Return and don't requeue
8988
return reconcile.Result{}, nil
9089
}
91-
err = fmt.Errorf("error getting NVIDIADriver object: %w", err)
92-
logger.V(consts.LogLevelError).Error(nil, err.Error())
90+
wrappedErr := fmt.Errorf("error getting NVIDIADriver object: %w", err)
91+
logger.Error(err, "error getting NVIDIADriver object")
9392
instance.Status.State = nvidiav1alpha1.NotReady
94-
condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error())
95-
if condErr != nil {
96-
logger.V(consts.LogLevelDebug).Error(nil, condErr.Error())
93+
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, wrappedErr.Error()); condErr != nil {
94+
logger.Error(condErr, "failed to set condition")
9795
}
9896
// Error reading the object - requeue the request.
99-
return reconcile.Result{}, err
97+
return reconcile.Result{}, wrappedErr
10098
}
10199

102100
// Get the singleton NVIDIA ClusterPolicy object in the cluster.
103101
clusterPolicyList := &gpuv1.ClusterPolicyList{}
104-
err = r.List(ctx, clusterPolicyList)
105-
if err != nil {
106-
err = fmt.Errorf("error getting ClusterPolicy list: %v", err)
107-
logger.V(consts.LogLevelError).Error(nil, err.Error())
102+
if err := r.List(ctx, clusterPolicyList); err != nil {
103+
err = fmt.Errorf("error getting ClusterPolicy list: %w", err)
104+
logger.Error(err, "error getting ClusterPolicy list")
108105
instance.Status.State = nvidiav1alpha1.NotReady
109-
condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error())
110-
if condErr != nil {
111-
logger.V(consts.LogLevelDebug).Error(nil, condErr.Error())
106+
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
107+
logger.Error(condErr, "failed to set condition")
112108
}
113-
return reconcile.Result{}, fmt.Errorf("error getting ClusterPolicyList: %v", err)
109+
return reconcile.Result{}, err
114110
}
115111

116112
if len(clusterPolicyList.Items) == 0 {
117-
err = fmt.Errorf("no ClusterPolicy object found in the cluster")
118-
logger.V(consts.LogLevelError).Error(nil, err.Error())
113+
err := fmt.Errorf("no ClusterPolicy object found in the cluster")
114+
logger.Error(err, "no ClusterPolicy object found in the cluster")
119115
instance.Status.State = nvidiav1alpha1.NotReady
120-
condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error())
121-
if condErr != nil {
122-
logger.V(consts.LogLevelDebug).Error(nil, condErr.Error())
116+
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
117+
logger.Error(condErr, "failed to set condition")
123118
}
124119
return reconcile.Result{}, err
125120
}
@@ -137,34 +132,30 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
137132
// Verify the nodeSelector configured for this NVIDIADriver instance does
138133
// not conflict with any other instances. This ensures only one driver
139134
// is deployed per GPU node.
140-
err = r.nodeSelectorValidator.Validate(ctx, instance)
141-
if err != nil {
142-
logger.V(consts.LogLevelError).Error(nil, err.Error())
143-
condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ConflictingNodeSelector, err.Error())
144-
if condErr != nil {
145-
logger.V(consts.LogLevelDebug).Error(nil, condErr.Error())
135+
if err := r.nodeSelectorValidator.Validate(ctx, instance); err != nil {
136+
logger.Error(err, "nodeSelector validation failed")
137+
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ConflictingNodeSelector, err.Error()); condErr != nil {
138+
logger.Error(condErr, "failed to set condition")
146139
}
147140
return reconcile.Result{}, nil
148141
}
149142

150143
if instance.Spec.UsePrecompiledDrivers() && (instance.Spec.IsGDSEnabled() || instance.Spec.IsGDRCopyEnabled()) {
151-
err = fmt.Errorf("GPUDirect Storage driver (nvidia-fs) and/or GDRCopy driver is not supported along with pre-compiled NVIDIA drivers")
152-
logger.V(consts.LogLevelError).Error(nil, err.Error())
144+
err := errors.New("GPUDirect Storage driver (nvidia-fs) and/or GDRCopy driver is not supported along with pre-compiled NVIDIA drivers")
145+
logger.Error(err, "unsupported driver combination detected")
153146
instance.Status.State = nvidiav1alpha1.NotReady
154-
condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error())
155-
if condErr != nil {
156-
logger.V(consts.LogLevelDebug).Error(nil, condErr.Error())
147+
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
148+
logger.Error(condErr, "failed to set condition")
157149
}
158150
return reconcile.Result{}, nil
159151
}
160152

161153
if instance.Spec.IsGDSEnabled() && instance.Spec.IsOpenKernelModulesRequired() && !instance.Spec.IsOpenKernelModulesEnabled() {
162-
err = fmt.Errorf("GPUDirect Storage driver '%s' is only supported with NVIDIA OpenRM drivers. Please set 'useOpenKernelModules=true' to enable OpenRM mode", instance.Spec.GPUDirectStorage.Version)
163-
logger.V(consts.LogLevelError).Error(nil, err.Error())
154+
err := fmt.Errorf("GPUDirect Storage driver '%s' is only supported with NVIDIA OpenRM drivers. Please set 'useOpenKernelModules=true' to enable OpenRM mode", instance.Spec.GPUDirectStorage.Version)
155+
logger.Error(err, "unsupported driver combination detected")
164156
instance.Status.State = nvidiav1alpha1.NotReady
165-
condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error())
166-
if condErr != nil {
167-
logger.V(consts.LogLevelDebug).Error(nil, condErr.Error())
157+
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
158+
logger.Error(condErr, "failed to set condition")
168159
}
169160
return reconcile.Result{}, nil
170161
}
@@ -173,12 +164,10 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
173164
secretName := instance.Spec.SecretEnv
174165
if len(secretName) > 0 {
175166
key := client.ObjectKey{Namespace: r.Namespace, Name: secretName}
176-
err = r.Get(ctx, key, &corev1.Secret{})
177-
if err != nil {
178-
logger.V(consts.LogLevelError).Error(nil, err.Error())
179-
condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error())
180-
if condErr != nil {
181-
logger.V(consts.LogLevelDebug).Error(nil, condErr.Error())
167+
if err := r.Get(ctx, key, &corev1.Secret{}); err != nil {
168+
logger.Error(err, "failed to get secret")
169+
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil {
170+
logger.Error(condErr, "failed to set condition")
182171
}
183172
return reconcile.Result{}, nil
184173
}
@@ -188,8 +177,7 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
188177
managerStatus := r.stateManager.SyncState(ctx, instance, infoCatalog)
189178

190179
// update CR status
191-
err = r.updateCrStatus(ctx, instance, managerStatus)
192-
if err != nil {
180+
if err := r.updateCrStatus(ctx, instance, managerStatus); err != nil {
193181
return ctrl.Result{}, err
194182
}
195183

@@ -199,24 +187,23 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
199187
for _, result := range managerStatus.StatesStatus {
200188
if result.Status != state.SyncStateReady && result.ErrInfo != nil {
201189
errorInfo = result.ErrInfo
202-
condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, fmt.Sprintf("Error syncing state %s: %v", result.StateName, errorInfo.Error()))
203-
if condErr != nil {
204-
logger.V(consts.LogLevelDebug).Error(nil, condErr.Error())
190+
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, fmt.Sprintf("Error syncing state %s: %v", result.StateName, errorInfo.Error())); condErr != nil {
191+
logger.Error(condErr, "failed to set condition")
205192
}
206193
break
207194
}
208195
}
209196
// if no errors are reported from any state, then we would be waiting on driver daemonset pods
210197
if errorInfo == nil {
211-
condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.DriverNotReady, "Waiting for driver pod to be ready")
212-
if condErr != nil {
213-
logger.V(consts.LogLevelDebug).Error(nil, condErr.Error())
198+
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.DriverNotReady, "Waiting for driver pod to be ready"); condErr != nil {
199+
logger.Error(condErr, "failed to set condition")
214200
}
215201
}
216202
return reconcile.Result{RequeueAfter: time.Second * 5}, nil
217203
}
218204

219-
if condErr = r.conditionUpdater.SetConditionsReady(ctx, instance, conditions.Reconciled, "All resources have been successfully reconciled"); condErr != nil {
205+
if condErr := r.conditionUpdater.SetConditionsReady(ctx, instance, conditions.Reconciled, "All resources have been successfully reconciled"); condErr != nil {
206+
logger.Error(condErr, "failed to set condition")
220207
return ctrl.Result{}, condErr
221208
}
222209
return reconcile.Result{}, nil
@@ -244,7 +231,7 @@ func (r *NVIDIADriverReconciler) updateCrStatus(
244231
reqLogger.V(consts.LogLevelInfo).Info("Updating CR Status", "Status", instance.Status)
245232
err = r.Status().Update(ctx, instance)
246233
if err != nil {
247-
reqLogger.V(consts.LogLevelError).Error(err, "Failed to update CR status")
234+
reqLogger.Error(err, "Failed to update CR status")
248235
return err
249236
}
250237
return nil

controllers/upgrade_controller.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
8686
clusterPolicy := &gpuv1.ClusterPolicy{}
8787
err := r.Get(ctx, req.NamespacedName, clusterPolicy)
8888
if err != nil {
89-
reqLogger.V(consts.LogLevelError).Error(err, "Error getting ClusterPolicy object")
89+
reqLogger.Error(err, "Error getting ClusterPolicy object")
9090
if clusterPolicyCtrl.operatorMetrics != nil {
9191
clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusClusterPolicyUnavailable)
9292
}
@@ -218,7 +218,7 @@ func (r *UpgradeReconciler) removeNodeUpgradeStateLabels(ctx context.Context) er
218218
delete(node.Labels, upgradeStateLabel)
219219
err = r.Update(ctx, node)
220220
if err != nil {
221-
r.Log.V(consts.LogLevelError).Error(
221+
r.Log.Error(
222222
err, "Failed to reset upgrade state label from node", "node", node)
223223
return err
224224
}

0 commit comments

Comments
 (0)