Skip to content

Commit 8011dd3

Browse files
committed
fix: ensure progression tracking runs after upstream status is committed
1 parent c1c24fd commit 8011dd3

2 files changed

Lines changed: 33 additions & 11 deletions

File tree

pkg/controller/trainjob_controller.go

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -144,22 +144,37 @@ func (r *TrainJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
144144
err = errors.Join(err, statusErr)
145145
}
146146

147-
if deadlineResult, deadlineErr := r.reconcileDeadline(ctx, &trainJob); deadlineErr != nil || deadlineResult.RequeueAfter > 0 {
148-
if !equality.Semantic.DeepEqual(&trainJob.Status, &prevTrainJob.Status) {
149-
return deadlineResult, errors.Join(err, r.client.Status().Patch(ctx, &trainJob, client.MergeFrom(prevTrainJob)))
147+
// reconcileDeadline may schedule a requeue to check the deadline later. Save the result
148+
// but do NOT return early — RHAI progression tracking must always run.
149+
deadlineResult, deadlineErr := r.reconcileDeadline(ctx, &trainJob)
150+
err = errors.Join(err, deadlineErr)
151+
152+
// Commit upstream status first before RHAI runs, so ReconcileProgression
153+
// re-fetches the latest committed state from the API server.
154+
// TODO(astefanutti): Consider using SSA once controller-runtime client has SSA support
155+
// for sub-resources. See: https://github.com/kubernetes-sigs/controller-runtime/issues/3183
156+
if !equality.Semantic.DeepEqual(&trainJob.Status, prevTrainJob.Status) {
157+
if statusErr := r.client.Status().Patch(ctx, &trainJob, client.MergeFrom(prevTrainJob)); statusErr != nil {
158+
return ctrl.Result{}, errors.Join(err, statusErr)
150159
}
151-
return deadlineResult, errors.Join(err, deadlineErr)
152160
}
153161

154-
if !equality.Semantic.DeepEqual(&trainJob.Status, prevTrainJob.Status) {
155-
// TODO(astefanutti): Consider using SSA once controller-runtime client has SSA support
156-
// for sub-resources. See: https://github.com/kubernetes-sigs/controller-runtime/issues/3183
157-
return ctrl.Result{}, errors.Join(err, r.client.Status().Patch(ctx, &trainJob, client.MergeFrom(prevTrainJob)))
162+
// RHAI progression tracking runs after upstream status is committed.
163+
// ReconcileProgression re-fetches the TrainJob from the API server to get the
164+
// latest committed state before patching annotations.
165+
result, progressionErr := progression.ReconcileProgression(ctx, r.client, r.apiReader, log, &trainJob)
166+
// Don't join progression errors with upstream errors - progression errors during pod startup
167+
// are expected (pod not ready, no IP yet) and shouldn't block requeueing.
168+
// If progression error exists, log it but don't prevent the requeue.
169+
if progressionErr != nil {
170+
log.V(1).Info("Progression tracking encountered an error (will retry on next reconcile)", "error", progressionErr)
158171
}
159172

160-
// RHAI progression tracking (use APIReader to avoid pod watches)
161-
result, progressionErr := progression.ReconcileProgression(ctx, r.client, r.apiReader, log, &trainJob)
162-
return result, errors.Join(err, progressionErr)
173+
// Use the deadline requeue if it is sooner than the progression requeue.
174+
if deadlineResult.RequeueAfter > 0 && (result.RequeueAfter == 0 || deadlineResult.RequeueAfter < result.RequeueAfter) {
175+
return deadlineResult, err
176+
}
177+
return result, err
163178
}
164179

165180
func (r *TrainJobReconciler) reconcileObjects(ctx context.Context, runtime jobruntimes.Runtime, trainJob *trainer.TrainJob) error {

pkg/rhai/progression/progression.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -721,6 +721,13 @@ func ReconcileProgression(ctx context.Context, c client.Client, reader client.Re
721721
return ctrl.Result{}, nil
722722
}
723723

724+
// Re-fetch the TrainJob from the API server to ensure we have the latest state,
725+
// including any status updates committed by the upstream reconcile, before
726+
// patching annotations.
727+
if err := reader.Get(ctx, client.ObjectKeyFromObject(trainJob), trainJob); err != nil {
728+
return ctrl.Result{}, client.IgnoreNotFound(err)
729+
}
730+
724731
isRunning := !meta.IsStatusConditionTrue(trainJob.Status.Conditions, trainer.TrainJobSuspended) &&
725732
!meta.IsStatusConditionTrue(trainJob.Status.Conditions, trainer.TrainJobComplete) &&
726733
!meta.IsStatusConditionTrue(trainJob.Status.Conditions, trainer.TrainJobFailed)

0 commit comments

Comments
 (0)