@@ -18,6 +18,7 @@ package controllers
1818
1919import (
2020 "context"
21+ "errors"
2122 "fmt"
2223 "maps"
2324 "time"
@@ -79,47 +80,41 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
7980
8081 // Get the NvidiaDriver instance from this request
8182 instance := & nvidiav1alpha1.NVIDIADriver {}
82- var condErr error
83- err := r .Get (ctx , req .NamespacedName , instance )
84- if err != nil {
83+ if err := r .Get (ctx , req .NamespacedName , instance ); err != nil {
8584 if apierrors .IsNotFound (err ) {
8685 // Request object not found, could have been deleted after reconcile request.
8786 // Owned objects are automatically garbage collected. For additional cleanup logic use finalizers.
8887 // Return and don't requeue
8988 return reconcile.Result {}, nil
9089 }
91- err = fmt .Errorf ("error getting NVIDIADriver object: %w" , err )
92- logger .V ( consts . LogLevelError ). Error (nil , err . Error () )
90+ wrappedErr : = fmt .Errorf ("error getting NVIDIADriver object: %w" , err )
91+ logger .Error (err , "error getting NVIDIADriver object" )
9392 instance .Status .State = nvidiav1alpha1 .NotReady
94- condErr = r .conditionUpdater .SetConditionsError (ctx , instance , conditions .ReconcileFailed , err .Error ())
95- if condErr != nil {
96- logger .V (consts .LogLevelDebug ).Error (nil , condErr .Error ())
93+ if condErr := r .conditionUpdater .SetConditionsError (ctx , instance , conditions .ReconcileFailed , wrappedErr .Error ()); condErr != nil {
94+ logger .Error (condErr , "failed to set condition" )
9795 }
9896 // Error reading the object - requeue the request.
99- return reconcile.Result {}, err
97+ return reconcile.Result {}, wrappedErr
10098 }
10199
102100 // Get the singleton NVIDIA ClusterPolicy object in the cluster.
103101 clusterPolicyList := & gpuv1.ClusterPolicyList {}
104- err = r .List (ctx , clusterPolicyList )
105- if err != nil {
106- err = fmt .Errorf ("error getting ClusterPolicy list: %v" , err )
107- logger .V (consts .LogLevelError ).Error (nil , err .Error ())
102+ if err := r .List (ctx , clusterPolicyList ); err != nil {
103+ err = fmt .Errorf ("error getting ClusterPolicy list: %w" , err )
104+ logger .Error (err , "error getting ClusterPolicy list" )
108105 instance .Status .State = nvidiav1alpha1 .NotReady
109- condErr = r .conditionUpdater .SetConditionsError (ctx , instance , conditions .ReconcileFailed , err .Error ())
110- if condErr != nil {
111- logger .V (consts .LogLevelDebug ).Error (nil , condErr .Error ())
106+ if condErr := r .conditionUpdater .SetConditionsError (ctx , instance , conditions .ReconcileFailed , err .Error ()); condErr != nil {
107+ logger .Error (condErr , "failed to set condition" )
112108 }
113- return reconcile.Result {}, fmt . Errorf ( "error getting ClusterPolicyList: %v" , err )
109+ return reconcile.Result {}, err
114110 }
115111
116112 if len (clusterPolicyList .Items ) == 0 {
117- err = fmt .Errorf ("no ClusterPolicy object found in the cluster" )
118- logger .V ( consts . LogLevelError ). Error (nil , err . Error () )
113+ err : = fmt .Errorf ("no ClusterPolicy object found in the cluster" )
114+ logger .Error (err , "no ClusterPolicy object found in the cluster" )
119115 instance .Status .State = nvidiav1alpha1 .NotReady
120- condErr = r .conditionUpdater .SetConditionsError (ctx , instance , conditions .ReconcileFailed , err .Error ())
121- if condErr != nil {
122- logger .V (consts .LogLevelDebug ).Error (nil , condErr .Error ())
116+ if condErr := r .conditionUpdater .SetConditionsError (ctx , instance , conditions .ReconcileFailed , err .Error ()); condErr != nil {
117+ logger .Error (condErr , "failed to set condition" )
123118 }
124119 return reconcile.Result {}, err
125120 }
@@ -137,34 +132,30 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
137132 // Verify the nodeSelector configured for this NVIDIADriver instance does
138133 // not conflict with any other instances. This ensures only one driver
139134 // is deployed per GPU node.
140- err = r .nodeSelectorValidator .Validate (ctx , instance )
141- if err != nil {
142- logger .V (consts .LogLevelError ).Error (nil , err .Error ())
143- condErr = r .conditionUpdater .SetConditionsError (ctx , instance , conditions .ConflictingNodeSelector , err .Error ())
144- if condErr != nil {
145- logger .V (consts .LogLevelDebug ).Error (nil , condErr .Error ())
135+ if err := r .nodeSelectorValidator .Validate (ctx , instance ); err != nil {
136+ logger .Error (err , "nodeSelector validation failed" )
137+ if condErr := r .conditionUpdater .SetConditionsError (ctx , instance , conditions .ConflictingNodeSelector , err .Error ()); condErr != nil {
138+ logger .Error (condErr , "failed to set condition" )
146139 }
147140 return reconcile.Result {}, nil
148141 }
149142
150143 if instance .Spec .UsePrecompiledDrivers () && (instance .Spec .IsGDSEnabled () || instance .Spec .IsGDRCopyEnabled ()) {
151- err = fmt . Errorf ("GPUDirect Storage driver (nvidia-fs) and/or GDRCopy driver is not supported along with pre-compiled NVIDIA drivers" )
152- logger .V ( consts . LogLevelError ). Error (nil , err . Error () )
144+ err := errors . New ("GPUDirect Storage driver (nvidia-fs) and/or GDRCopy driver is not supported along with pre-compiled NVIDIA drivers" )
145+ logger .Error (err , "unsupported driver combination detected" )
153146 instance .Status .State = nvidiav1alpha1 .NotReady
154- condErr = r .conditionUpdater .SetConditionsError (ctx , instance , conditions .ReconcileFailed , err .Error ())
155- if condErr != nil {
156- logger .V (consts .LogLevelDebug ).Error (nil , condErr .Error ())
147+ if condErr := r .conditionUpdater .SetConditionsError (ctx , instance , conditions .ReconcileFailed , err .Error ()); condErr != nil {
148+ logger .Error (condErr , "failed to set condition" )
157149 }
158150 return reconcile.Result {}, nil
159151 }
160152
161153 if instance .Spec .IsGDSEnabled () && instance .Spec .IsOpenKernelModulesRequired () && ! instance .Spec .IsOpenKernelModulesEnabled () {
162- err = fmt .Errorf ("GPUDirect Storage driver '%s' is only supported with NVIDIA OpenRM drivers. Please set 'useOpenKernelModules=true' to enable OpenRM mode" , instance .Spec .GPUDirectStorage .Version )
163- logger .V ( consts . LogLevelError ). Error (nil , err . Error () )
154+ err : = fmt .Errorf ("GPUDirect Storage driver '%s' is only supported with NVIDIA OpenRM drivers. Please set 'useOpenKernelModules=true' to enable OpenRM mode" , instance .Spec .GPUDirectStorage .Version )
155+ logger .Error (err , "unsupported driver combination detected" )
164156 instance .Status .State = nvidiav1alpha1 .NotReady
165- condErr = r .conditionUpdater .SetConditionsError (ctx , instance , conditions .ReconcileFailed , err .Error ())
166- if condErr != nil {
167- logger .V (consts .LogLevelDebug ).Error (nil , condErr .Error ())
157+ if condErr := r .conditionUpdater .SetConditionsError (ctx , instance , conditions .ReconcileFailed , err .Error ()); condErr != nil {
158+ logger .Error (condErr , "failed to set condition" )
168159 }
169160 return reconcile.Result {}, nil
170161 }
@@ -173,12 +164,10 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
173164 secretName := instance .Spec .SecretEnv
174165 if len (secretName ) > 0 {
175166 key := client.ObjectKey {Namespace : r .Namespace , Name : secretName }
176- err = r .Get (ctx , key , & corev1.Secret {})
177- if err != nil {
178- logger .V (consts .LogLevelError ).Error (nil , err .Error ())
179- condErr = r .conditionUpdater .SetConditionsError (ctx , instance , conditions .ReconcileFailed , err .Error ())
180- if condErr != nil {
181- logger .V (consts .LogLevelDebug ).Error (nil , condErr .Error ())
167+ if err := r .Get (ctx , key , & corev1.Secret {}); err != nil {
168+ logger .Error (err , "failed to get secret" )
169+ if condErr := r .conditionUpdater .SetConditionsError (ctx , instance , conditions .ReconcileFailed , err .Error ()); condErr != nil {
170+ logger .Error (condErr , "failed to set condition" )
182171 }
183172 return reconcile.Result {}, nil
184173 }
@@ -188,8 +177,7 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
188177 managerStatus := r .stateManager .SyncState (ctx , instance , infoCatalog )
189178
190179 // update CR status
191- err = r .updateCrStatus (ctx , instance , managerStatus )
192- if err != nil {
180+ if err := r .updateCrStatus (ctx , instance , managerStatus ); err != nil {
193181 return ctrl.Result {}, err
194182 }
195183
@@ -199,24 +187,23 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
199187 for _ , result := range managerStatus .StatesStatus {
200188 if result .Status != state .SyncStateReady && result .ErrInfo != nil {
201189 errorInfo = result .ErrInfo
202- condErr = r .conditionUpdater .SetConditionsError (ctx , instance , conditions .ReconcileFailed , fmt .Sprintf ("Error syncing state %s: %v" , result .StateName , errorInfo .Error ()))
203- if condErr != nil {
204- logger .V (consts .LogLevelDebug ).Error (nil , condErr .Error ())
190+ if condErr := r .conditionUpdater .SetConditionsError (ctx , instance , conditions .ReconcileFailed , fmt .Sprintf ("Error syncing state %s: %v" , result .StateName , errorInfo .Error ())); condErr != nil {
191+ logger .Error (condErr , "failed to set condition" )
205192 }
206193 break
207194 }
208195 }
209196 // if no errors are reported from any state, then we would be waiting on driver daemonset pods
210197 if errorInfo == nil {
211- condErr = r .conditionUpdater .SetConditionsError (ctx , instance , conditions .DriverNotReady , "Waiting for driver pod to be ready" )
212- if condErr != nil {
213- logger .V (consts .LogLevelDebug ).Error (nil , condErr .Error ())
198+ if condErr := r .conditionUpdater .SetConditionsError (ctx , instance , conditions .DriverNotReady , "Waiting for driver pod to be ready" ); condErr != nil {
199+ logger .Error (condErr , "failed to set condition" )
214200 }
215201 }
216202 return reconcile.Result {RequeueAfter : time .Second * 5 }, nil
217203 }
218204
219- if condErr = r .conditionUpdater .SetConditionsReady (ctx , instance , conditions .Reconciled , "All resources have been successfully reconciled" ); condErr != nil {
205+ if condErr := r .conditionUpdater .SetConditionsReady (ctx , instance , conditions .Reconciled , "All resources have been successfully reconciled" ); condErr != nil {
206+ logger .Error (condErr , "failed to set condition" )
220207 return ctrl.Result {}, condErr
221208 }
222209 return reconcile.Result {}, nil
@@ -244,7 +231,7 @@ func (r *NVIDIADriverReconciler) updateCrStatus(
244231 reqLogger .V (consts .LogLevelInfo ).Info ("Updating CR Status" , "Status" , instance .Status )
245232 err = r .Status ().Update (ctx , instance )
246233 if err != nil {
247- reqLogger .V ( consts . LogLevelError ). Error (err , "Failed to update CR status" )
234+ reqLogger .Error (err , "Failed to update CR status" )
248235 return err
249236 }
250237 return nil
0 commit comments