Skip to content

Commit b19e52f

Browse files
committed
fix: prevent progression errors from blocking requeue
1 parent 8011dd3 commit b19e52f

1 file changed

Lines changed: 1 addition & 8 deletions

File tree

pkg/controller/trainjob_controller.go

Lines changed: 1 addition & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -149,8 +149,6 @@ func (r *TrainJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
149 149
deadlineResult, deadlineErr := r.reconcileDeadline(ctx, &trainJob)
150 150
err = errors.Join(err, deadlineErr)
151 151

152-
// Commit upstream status first before RHAI runs, so ReconcileProgression
153-
// re-fetches the latest committed state from the API server.
154 152
// TODO(astefanutti): Consider using SSA once controller-runtime client has SSA support
155 153
// for sub-resources. See: https://github.com/kubernetes-sigs/controller-runtime/issues/3183
156 154
if !equality.Semantic.DeepEqual(&trainJob.Status, prevTrainJob.Status) {
@@ -159,13 +157,8 @@ func (r *TrainJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
159 157
}
160 158
}
161 159

162-
// RHAI progression tracking runs after upstream status is committed.
163-
// ReconcileProgression re-fetches the TrainJob from the API server to get the
164-
// latest committed state before patching annotations.
160+
// RHAI progression tracking (fetches fresh state from API server, logs errors without joining)
165 161
result, progressionErr := progression.ReconcileProgression(ctx, r.client, r.apiReader, log, &trainJob)
166-
// Don't join progression errors with upstream errors - progression errors during pod startup
167-
// are expected (pod not ready, no IP yet) and shouldn't block requeueing.
168-
// If progression error exists, log it but don't prevent the requeue.
169 162
if progressionErr != nil {
170 163
log.V(1).Info("Progression tracking encountered an error (will retry on next reconcile)", "error", progressionErr)
171 164
}

0 commit comments

Comments
 (0)