Skip to content
This repository was archived by the owner on Sep 19, 2022. It is now read-only.

Commit e8d4d04

Browse files
authored
Merge pull request #151 from johnugeorge/crdchanges
Implement ActiveDeadlineSeconds and BackoffLimit
2 parents 261dd72 + 511af4c commit e8d4d04

File tree

11 files changed

+538
-23
lines changed

11 files changed

+538
-23
lines changed

Gopkg.lock

Lines changed: 3 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/apis/pytorch/v1beta2/types.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,16 @@ type PyTorchJob struct {
4242

4343
// PyTorchJobSpec is a desired state description of the PyTorchJob.
4444
type PyTorchJobSpec struct {
45+
// Specifies the duration in seconds relative to the startTime that the job may be active
46+
// before the system tries to terminate it; value must be positive integer.
47+
// This method applies only to pods with restartPolicy == OnFailure or Always.
48+
// +optional
49+
ActiveDeadlineSeconds *int64 `json:"activeDeadlineSeconds,omitempty"`
50+
51+
// Optional number of retries before marking this job failed.
52+
// +optional
53+
BackoffLimit *int32 `json:"backoffLimit,omitempty"`
54+
4555
// CleanPodPolicy defines the policy to kill pods after PyTorchJob is
4656
// succeeded.
4757
// Default to Running.

pkg/apis/pytorch/v1beta2/zz_generated.deepcopy.go

Lines changed: 10 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/common/util/v1beta2/testutil/job.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ package testutil
1717
import (
1818
"time"
1919

20+
"github.com/golang/protobuf/proto"
2021
"k8s.io/api/core/v1"
2122
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2223

@@ -50,9 +51,42 @@ func NewPyTorchJobWithCleanupJobDelay(master, worker int, ttl *int32) *v1beta2.P
5051
return job
5152
}
5253

54+
func NewPyTorchJobWithActiveDeadlineSeconds(master, worker int, ads *int64) *v1beta2.PyTorchJob {
55+
if master == 1 {
56+
job := NewPyTorchJobWithMaster(worker)
57+
job.Spec.ActiveDeadlineSeconds = ads
58+
policy := common.CleanPodPolicyAll
59+
job.Spec.CleanPodPolicy = &policy
60+
return job
61+
}
62+
job := NewPyTorchJob(worker)
63+
job.Spec.ActiveDeadlineSeconds = ads
64+
policy := common.CleanPodPolicyAll
65+
job.Spec.CleanPodPolicy = &policy
66+
return job
67+
}
68+
69+
func NewPyTorchJobWithBackoffLimit(master, worker int, backoffLimit *int32) *v1beta2.PyTorchJob {
70+
if master == 1 {
71+
job := NewPyTorchJobWithMaster(worker)
72+
job.Spec.BackoffLimit = backoffLimit
73+
job.Spec.PyTorchReplicaSpecs["Worker"].RestartPolicy = "OnFailure"
74+
policy := common.CleanPodPolicyAll
75+
job.Spec.CleanPodPolicy = &policy
76+
return job
77+
}
78+
job := NewPyTorchJob(worker)
79+
job.Spec.BackoffLimit = backoffLimit
80+
job.Spec.PyTorchReplicaSpecs["Worker"].RestartPolicy = "OnFailure"
81+
policy := common.CleanPodPolicyAll
82+
job.Spec.CleanPodPolicy = &policy
83+
return job
84+
}
85+
5386
func NewPyTorchJobWithMaster(worker int) *v1beta2.PyTorchJob {
5487
job := NewPyTorchJob(worker)
5588
job.Spec.PyTorchReplicaSpecs[v1beta2.PyTorchReplicaTypeMaster] = &common.ReplicaSpec{
89+
Replicas: proto.Int32(1),
5690
Template: NewPyTorchReplicaSpecTemplate(),
5791
}
5892
return job

pkg/common/util/v1beta2/testutil/pod.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,15 +64,18 @@ func NewPodList(count int32, status v1.PodPhase, job *v1beta2.PyTorchJob, typ st
6464
return pods
6565
}
6666

67-
func SetPodsStatuses(podIndexer cache.Indexer, job *v1beta2.PyTorchJob, typ string, pendingPods, activePods, succeededPods, failedPods int32, t *testing.T) {
67+
func SetPodsStatuses(podIndexer cache.Indexer, job *v1beta2.PyTorchJob, typ string, pendingPods, activePods, succeededPods, failedPods int32, restartCounts []int32, t *testing.T) {
6868
var index int32
6969
for _, pod := range NewPodList(pendingPods, v1.PodPending, job, typ, index, t) {
7070
if err := podIndexer.Add(pod); err != nil {
7171
t.Errorf("%s: unexpected error when adding pod %v", job.Name, err)
7272
}
7373
}
7474
index += pendingPods
75-
for _, pod := range NewPodList(activePods, v1.PodRunning, job, typ, index, t) {
75+
for i, pod := range NewPodList(activePods, v1.PodRunning, job, typ, index, t) {
76+
if restartCounts != nil {
77+
pod.Status.ContainerStatuses = []v1.ContainerStatus{{RestartCount: restartCounts[i]}}
78+
}
7679
if err := podIndexer.Add(pod); err != nil {
7780
t.Errorf("%s: unexpected error when adding pod %v", job.Name, err)
7881
}

pkg/controller.v1beta2/pytorch/controller.go

Lines changed: 113 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ package pytorch
1717

1818
import (
1919
"fmt"
20+
"strings"
2021
"time"
2122

2223
kubebatchclient "github.com/kubernetes-sigs/kube-batch/pkg/client/clientset/versioned"
@@ -38,8 +39,10 @@ import (
3839
jobinformers "github.com/kubeflow/pytorch-operator/pkg/client/informers/externalversions"
3940
jobinformersv1beta2 "github.com/kubeflow/pytorch-operator/pkg/client/informers/externalversions/pytorch/v1beta2"
4041
joblisters "github.com/kubeflow/pytorch-operator/pkg/client/listers/pytorch/v1beta2"
42+
common "github.com/kubeflow/tf-operator/pkg/apis/common/v1beta2"
4143
"github.com/kubeflow/tf-operator/pkg/common/jobcontroller"
4244
pylogger "github.com/kubeflow/tf-operator/pkg/logger"
45+
"github.com/kubeflow/tf-operator/pkg/util/k8sutil"
4346
)
4447

4548
const (
@@ -326,18 +329,15 @@ func (pc *PyTorchController) syncPyTorchJob(key string) (bool, error) {
326329
return true, err
327330
}
328331

329-
func getTotalReplicas(obj metav1.Object) int32 {
330-
job := obj.(*v1beta2.PyTorchJob)
331-
jobReplicas := int32(0)
332-
for _, r := range job.Spec.PyTorchReplicaSpecs {
333-
jobReplicas += *r.Replicas
334-
}
335-
return jobReplicas
336-
}
337-
338332
// reconcilePyTorchJobs checks and updates replicas for each given PyTorchReplicaSpec.
339333
// It will requeue the job in case of an error while creating/deleting pods/services.
340334
func (pc *PyTorchController) reconcilePyTorchJobs(job *v1beta2.PyTorchJob) error {
335+
jobKey, err := KeyFunc(job)
336+
if err != nil {
337+
utilruntime.HandleError(fmt.Errorf("couldn't get key for pytorch job object %#v: %v", job, err))
338+
return err
339+
}
340+
341341
logger := pylogger.LoggerForJob(job)
342342
logger.Infof("Reconcile PyTorchJobs %s", job.Name)
343343

@@ -355,8 +355,46 @@ func (pc *PyTorchController) reconcilePyTorchJobs(job *v1beta2.PyTorchJob) error
355355
return err
356356
}
357357

358+
// retrieve the previous number of retry
359+
previousRetry := pc.WorkQueue.NumRequeues(jobKey)
360+
361+
activePods := k8sutil.FilterActivePods(pods)
362+
active := int32(len(activePods))
363+
failed := int32(k8sutil.FilterPods(pods, v1.PodFailed))
364+
totalReplicas := getTotalReplicas(job)
365+
prevReplicasFailedNum := getTotalFailedReplicas(job)
366+
367+
var failureMessage string
368+
jobExceedsLimit := false
369+
exceedsBackoffLimit := false
370+
pastBackoffLimit := false
371+
372+
if job.Spec.BackoffLimit != nil {
373+
jobHasNewFailure := failed > prevReplicasFailedNum
374+
// new failures happen when status does not reflect the failures and active
375+
// is different than parallelism, otherwise the previous controller loop
376+
// failed updating status so even if we pick up failure it is not a new one
377+
exceedsBackoffLimit = jobHasNewFailure && (active != totalReplicas) &&
378+
(int32(previousRetry)+1 > *job.Spec.BackoffLimit)
379+
380+
pastBackoffLimit, err = pc.pastBackoffLimit(job, pods)
381+
if err != nil {
382+
return err
383+
}
384+
}
385+
386+
if exceedsBackoffLimit || pastBackoffLimit {
387+
// check if the number of pod restart exceeds backoff (for restart OnFailure only)
388+
// OR if the number of failed jobs increased since the last syncJob
389+
jobExceedsLimit = true
390+
failureMessage = fmt.Sprintf("PyTorchJob %s has failed because it has reached the specified backoff limit", job.Name)
391+
} else if pc.pastActiveDeadline(job) {
392+
failureMessage = fmt.Sprintf("PyTorchJob %s has failed because it was active longer than specified deadline", job.Name)
393+
jobExceedsLimit = true
394+
}
395+
358396
// If the PyTorchJob is terminated, delete all pods and services.
359-
if isSucceeded(job.Status) || isFailed(job.Status) {
397+
if isSucceeded(job.Status) || isFailed(job.Status) || jobExceedsLimit {
360398
if err := pc.deletePodsAndServices(job, pods); err != nil {
361399
return err
362400
}
@@ -375,7 +413,18 @@ func (pc *PyTorchController) reconcilePyTorchJobs(job *v1beta2.PyTorchJob) error
375413

376414
}
377415
}
378-
416+
if jobExceedsLimit {
417+
pc.Recorder.Event(job, v1.EventTypeNormal, pytorchJobFailedReason, failureMessage)
418+
if job.Status.CompletionTime == nil {
419+
now := metav1.Now()
420+
job.Status.CompletionTime = &now
421+
}
422+
err := updatePyTorchJobConditions(job, common.JobFailed, pytorchJobFailedReason, failureMessage)
423+
if err != nil {
424+
logger.Infof("Append pytorchjob condition error: %v", err)
425+
return err
426+
}
427+
}
379428
// At this point the pods may have been deleted, so if the job succeeded, we need to manually set the replica status.
380429
// If any replicas are still Active, set their status to succeeded.
381430
if isSucceeded(job.Status) {
@@ -434,6 +483,59 @@ func (pc *PyTorchController) satisfiedExpectations(job *v1beta2.PyTorchJob) bool
434483
return satisfied
435484
}
436485

486+
// pastBackoffLimitOnFailure checks if container restartCounts sum exceeds BackoffLimit
487+
// this method applies only to pods with restartPolicy == OnFailure or Always
488+
func (pc *PyTorchController) pastBackoffLimit(job *v1beta2.PyTorchJob, pods []*v1.Pod) (bool, error) {
489+
if job.Spec.BackoffLimit == nil {
490+
return false, nil
491+
}
492+
logger := pylogger.LoggerForJob(job)
493+
result := int32(0)
494+
for rtype, spec := range job.Spec.PyTorchReplicaSpecs {
495+
if spec.RestartPolicy != common.RestartPolicyOnFailure && spec.RestartPolicy != common.RestartPolicyAlways {
496+
logger.Warnf("The restart policy of replica %v of the job %v is not OnFailure or Always. Not counted in backoff limit.", rtype, job.Name)
497+
continue
498+
}
499+
// Convert PyTorchReplicaType to lower string.
500+
rt := strings.ToLower(string(rtype))
501+
pods, err := pc.FilterPodsForReplicaType(pods, rt)
502+
if err != nil {
503+
return false, err
504+
}
505+
for i := range pods {
506+
po := pods[i]
507+
if po.Status.Phase != v1.PodRunning {
508+
continue
509+
}
510+
for j := range po.Status.InitContainerStatuses {
511+
stat := po.Status.InitContainerStatuses[j]
512+
result += stat.RestartCount
513+
}
514+
for j := range po.Status.ContainerStatuses {
515+
stat := po.Status.ContainerStatuses[j]
516+
result += stat.RestartCount
517+
}
518+
}
519+
}
520+
521+
if *job.Spec.BackoffLimit == 0 {
522+
return result > 0, nil
523+
}
524+
return result >= *job.Spec.BackoffLimit, nil
525+
}
526+
527+
// pastActiveDeadline checks if job has ActiveDeadlineSeconds field set and if it is exceeded.
528+
func (pc *PyTorchController) pastActiveDeadline(job *v1beta2.PyTorchJob) bool {
529+
if job.Spec.ActiveDeadlineSeconds == nil || job.Status.StartTime == nil {
530+
return false
531+
}
532+
now := metav1.Now()
533+
start := job.Status.StartTime.Time
534+
duration := now.Time.Sub(start)
535+
allowedDuration := time.Duration(*job.Spec.ActiveDeadlineSeconds) * time.Second
536+
return duration >= allowedDuration
537+
}
538+
437539
func (pc *PyTorchController) GetJobFromInformerCache(namespace, name string) (metav1.Object, error) {
438540
return pc.getPyTorchJobFromName(namespace, name)
439541
}

pkg/controller.v1beta2/pytorch/controller_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -272,8 +272,8 @@ func TestNormalPath(t *testing.T) {
272272
}
273273

274274
podIndexer := kubeInformerFactory.Core().V1().Pods().Informer().GetIndexer()
275-
testutil.SetPodsStatuses(podIndexer, job, testutil.LabelWorker, tc.pendingWorkerPods, tc.activeWorkerPods, tc.succeededWorkerPods, tc.failedWorkerPods, t)
276-
testutil.SetPodsStatuses(podIndexer, job, testutil.LabelMaster, tc.pendingMasterPods, tc.activeMasterPods, tc.succeededMasterPods, tc.failedMasterPods, t)
275+
testutil.SetPodsStatuses(podIndexer, job, testutil.LabelWorker, tc.pendingWorkerPods, tc.activeWorkerPods, tc.succeededWorkerPods, tc.failedWorkerPods, nil, t)
276+
testutil.SetPodsStatuses(podIndexer, job, testutil.LabelMaster, tc.pendingMasterPods, tc.activeMasterPods, tc.succeededMasterPods, tc.failedMasterPods, nil, t)
277277

278278
serviceIndexer := kubeInformerFactory.Core().V1().Services().Informer().GetIndexer()
279279
testutil.SetServices(serviceIndexer, job, testutil.LabelWorker, tc.activeWorkerServices, t)

pkg/controller.v1beta2/pytorch/job.go

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,8 +105,37 @@ func (pc *PyTorchController) updatePyTorchJob(old, cur interface{}) {
105105
if err != nil {
106106
return
107107
}
108+
curPyTorchJob, err := jobFromUnstructured(cur)
109+
if err != nil {
110+
return
111+
}
112+
113+
// never return error
114+
key, err := KeyFunc(curPyTorchJob)
115+
if err != nil {
116+
return
117+
}
118+
108119
log.Infof("Updating pytorchjob: %s", oldPyTorchJob.Name)
109120
pc.enqueuePyTorchJob(cur)
121+
122+
// check if need to add a new rsync for ActiveDeadlineSeconds
123+
if curPyTorchJob.Status.StartTime != nil {
124+
curPyTorchJobADS := curPyTorchJob.Spec.ActiveDeadlineSeconds
125+
if curPyTorchJobADS == nil {
126+
return
127+
}
128+
oldPyTorchJobADS := oldPyTorchJob.Spec.ActiveDeadlineSeconds
129+
if oldPyTorchJobADS == nil || *oldPyTorchJobADS != *curPyTorchJobADS {
130+
now := metav1.Now()
131+
start := curPyTorchJob.Status.StartTime.Time
132+
passed := now.Time.Sub(start)
133+
total := time.Duration(*curPyTorchJobADS) * time.Second
134+
// AddAfter will handle total < passed
135+
pc.WorkQueue.AddAfter(key, total-passed)
136+
log.Infof("job ActiveDeadlineSeconds updated, will rsync after %d seconds", total-passed)
137+
}
138+
}
110139
}
111140

112141
func (pc *PyTorchController) deletePodsAndServices(job *v1beta2.PyTorchJob, pods []*v1.Pod) error {
@@ -160,3 +189,19 @@ func (pc *PyTorchController) cleanupPyTorchJob(job *v1beta2.PyTorchJob) error {
160189
func (pc *PyTorchController) deletePyTorchJob(job *v1beta2.PyTorchJob) error {
161190
return pc.jobClientSet.KubeflowV1beta2().PyTorchJobs(job.Namespace).Delete(job.Name, &metav1.DeleteOptions{})
162191
}
192+
193+
func getTotalReplicas(job *v1beta2.PyTorchJob) int32 {
194+
jobReplicas := int32(0)
195+
for _, r := range job.Spec.PyTorchReplicaSpecs {
196+
jobReplicas += *r.Replicas
197+
}
198+
return jobReplicas
199+
}
200+
201+
func getTotalFailedReplicas(job *v1beta2.PyTorchJob) int32 {
202+
totalFailedReplicas := int32(0)
203+
for rtype := range job.Status.ReplicaStatuses {
204+
totalFailedReplicas += job.Status.ReplicaStatuses[rtype].Failed
205+
}
206+
return totalFailedReplicas
207+
}

0 commit comments

Comments
 (0)