Skip to content

Commit a26ff21

Browse files
committed
fix: respect cleanPodPolicy when job exceeds backoffLimit
Move UpdateJobConditions(JobFailed) before DeletePodsAndServices in the jobExceedsLimit block so that IsFinished() returns true and the cleanPodPolicy: None guard is not bypassed. Previously, DeletePodsAndServices was called before the JobFailed condition was set, causing all pods to be unconditionally deleted regardless of cleanPodPolicy when a job exceeded its backoffLimit. Tested on live cluster: pods now preserved after backoffLimit failure with cleanPodPolicy: None. Fixes: #3419 Signed-off-by: Aviad Hayumi <aviad.hayumi@run.ai>
1 parent 9121e6a commit a26ff21

2 files changed

Lines changed: 16 additions & 6 deletions

File tree

pkg/controller.v1/common/job.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -220,8 +220,12 @@ func (jc *JobController) ReconcileJobs(
220220
jobStatus.CompletionTime = &now
221221
}
222222

223-
// If the Job exceeds backoff limit or is past active deadline
224-
// delete all pods and services, then set the status to failed
223+
// Set JobFailed condition BEFORE cleanup so that DeletePodsAndServices
224+
// sees IsFinished()==true and correctly respects cleanPodPolicy.
225+
// Fix: https://github.com/kubeflow/trainer/issues/3419
226+
jc.Recorder.Event(runtimeObject, corev1.EventTypeNormal, commonutil.NewReason(jobKind, commonutil.JobFailedReason), failureMessage)
227+
commonutil.UpdateJobConditions(&jobStatus, apiv1.JobFailed, corev1.ConditionTrue, commonutil.NewReason(jobKind, commonutil.JobFailedReason), failureMessage)
228+
225229
if err := jc.DeletePodsAndServices(runtimeObject, runPolicy, jobStatus, pods); err != nil {
226230
return err
227231
}
@@ -240,10 +244,6 @@ func (jc *JobController) ReconcileJobs(
240244
}
241245
}
242246

243-
jc.Recorder.Event(runtimeObject, corev1.EventTypeNormal, commonutil.NewReason(jobKind, commonutil.JobFailedReason), failureMessage)
244-
245-
commonutil.UpdateJobConditions(&jobStatus, apiv1.JobFailed, corev1.ConditionTrue, commonutil.NewReason(jobKind, commonutil.JobFailedReason), failureMessage)
246-
247247
return jc.Controller.UpdateJobStatusInApiServer(job, &jobStatus)
248248
} else {
249249
// General cases which need to reconcile

pkg/controller.v1/common/job_test.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,16 @@ func TestDeletePodsAndServices(T *testing.T) {
9797
wantPods: &corev1.PodList{},
9898
wantService: &corev1.ServiceList{},
9999
},
100+
// This test reproduces the backoffLimit bug: when DeletePodsAndServices is called
101+
// without JobFailed condition set (as happens in the jobExceedsLimit path before
102+
// the fix), pods are deleted despite cleanPodPolicy: None.
103+
// See: https://github.com/kubeflow/trainer/issues/3419
104+
"Unfinished job with cleanPodPolicy None deletes pods (pre-fix backoffLimit bug)": {
105+
cleanPodPolicy: apiv1.CleanPodPolicyNone,
106+
jobCondition: "",
107+
wantPods: &corev1.PodList{},
108+
wantService: &corev1.ServiceList{},
109+
},
100110
}
101111
for name, tc := range cases {
102112
T.Run(name, func(t *testing.T) {

0 commit comments

Comments
 (0)