@@ -144,22 +144,37 @@ func (r *TrainJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
144144 err = errors .Join (err , statusErr )
145145 }
146146
147- if deadlineResult , deadlineErr := r .reconcileDeadline (ctx , & trainJob ); deadlineErr != nil || deadlineResult .RequeueAfter > 0 {
148- if ! equality .Semantic .DeepEqual (& trainJob .Status , & prevTrainJob .Status ) {
149- return deadlineResult , errors .Join (err , r .client .Status ().Patch (ctx , & trainJob , client .MergeFrom (prevTrainJob )))
147+ // reconcileDeadline may schedule a requeue to check the deadline later. Save the result
148+ // but do NOT return early — RHAI progression tracking must always run.
149+ deadlineResult , deadlineErr := r .reconcileDeadline (ctx , & trainJob )
150+ err = errors .Join (err , deadlineErr )
151+
152+ // Commit upstream status first before RHAI runs, so ReconcileProgression
153+ // re-fetches the latest committed state from the API server.
154+ // TODO(astefanutti): Consider using SSA once controller-runtime client has SSA support
155+ // for sub-resources. See: https://github.com/kubernetes-sigs/controller-runtime/issues/3183
156+ if ! equality .Semantic .DeepEqual (& trainJob .Status , prevTrainJob .Status ) {
157+ if statusErr := r .client .Status ().Patch (ctx , & trainJob , client .MergeFrom (prevTrainJob )); statusErr != nil {
158+ return ctrl.Result {}, errors .Join (err , statusErr )
150159 }
151- return deadlineResult , errors .Join (err , deadlineErr )
152160 }
153161
154- if ! equality .Semantic .DeepEqual (& trainJob .Status , prevTrainJob .Status ) {
155- // TODO(astefanutti): Consider using SSA once controller-runtime client has SSA support
156- // for sub-resources. See: https://github.com/kubernetes-sigs/controller-runtime/issues/3183
157- return ctrl.Result {}, errors .Join (err , r .client .Status ().Patch (ctx , & trainJob , client .MergeFrom (prevTrainJob )))
162+ // RHAI progression tracking runs after upstream status is committed.
163+ // ReconcileProgression re-fetches the TrainJob from the API server to get the
164+ // latest committed state before patching annotations.
165+ result , progressionErr := progression .ReconcileProgression (ctx , r .client , r .apiReader , log , & trainJob )
166+ // Don't join progression errors with upstream errors - progression errors during pod startup
167+ // are expected (pod not ready, no IP yet) and shouldn't block requeueing.
168+ // If progression error exists, log it but don't prevent the requeue.
169+ if progressionErr != nil {
170+ log .V (1 ).Info ("Progression tracking encountered an error (will retry on next reconcile)" , "error" , progressionErr )
158171 }
159172
160- // RHAI progression tracking (use APIReader to avoid pod watches)
161- result , progressionErr := progression .ReconcileProgression (ctx , r .client , r .apiReader , log , & trainJob )
162- return result , errors .Join (err , progressionErr )
173+ // Use the deadline requeue if it is sooner than the progression requeue.
174+ if deadlineResult .RequeueAfter > 0 && (result .RequeueAfter == 0 || deadlineResult .RequeueAfter < result .RequeueAfter ) {
175+ return deadlineResult , err
176+ }
177+ return result , err
163178}
164179
165180func (r * TrainJobReconciler ) reconcileObjects (ctx context.Context , runtime jobruntimes.Runtime , trainJob * trainer.TrainJob ) error {
0 commit comments