Skip to content
This repository was archived by the owner on Sep 19, 2022. It is now read-only.

Commit 396fb2f

Browse files
johnugeorge authored and k8s-ci-robot committed
Sync PodGroup fix (#172)
1 parent 999c6ca commit 396fb2f

File tree

8 files changed

+89
-85
lines changed

8 files changed

+89
-85
lines changed

Gopkg.lock

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Gopkg.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@ required = [
1919

2020
[[constraint]]
2121
name = "github.com/kubeflow/tf-operator"
22-
version = "v0.5.1"
22+
version = "v0.5.3"
2323

2424
[[constraint]]
2525
name = "github.com/sirupsen/logrus"

pkg/controller.v1/pytorch/controller.go

Lines changed: 13 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -307,14 +307,6 @@ func (pc *PyTorchController) syncPyTorchJob(key string) (bool, error) {
307307
job := sharedJob.DeepCopy()
308308
jobNeedsSync := pc.satisfiedExpectations(job)
309309

310-
if pc.Config.EnableGangScheduling {
311-
minAvailableReplicas := getTotalReplicas(job)
312-
_, err := pc.SyncPodGroup(job, minAvailableReplicas)
313-
if err != nil {
314-
logger.Warnf("Sync PodGroup %v: %v", job.Name, err)
315-
}
316-
}
317-
318310
// Set default for the new job.
319311
scheme.Scheme.Default(job)
320312

@@ -419,13 +411,8 @@ func (pc *PyTorchController) reconcilePyTorchJobs(job *pyv1.PyTorchJob) error {
419411
}
420412

421413
if pc.Config.EnableGangScheduling {
422-
pc.Recorder.Event(job, v1.EventTypeNormal, "JobTerminated", "Job is terminated, deleting PodGroup")
423414
if err := pc.DeletePodGroup(job); err != nil {
424-
pc.Recorder.Eventf(job, v1.EventTypeWarning, "FailedDeletePodGroup", "Error deleting: %v", err)
425415
return err
426-
} else {
427-
pc.Recorder.Eventf(job, v1.EventTypeNormal, "SuccessfulDeletePodGroup", "Deleted PodGroup: %v", job.Name)
428-
429416
}
430417
}
431418

@@ -437,11 +424,15 @@ func (pc *PyTorchController) reconcilePyTorchJobs(job *pyv1.PyTorchJob) error {
437424
job.Status.ReplicaStatuses[rtype].Active = 0
438425
}
439426
}
440-
// no need to update the job if the status hasn't changed since last time.
441-
if !reflect.DeepEqual(*oldStatus, job.Status) {
442-
return pc.updateStatusHandler(job)
427+
return pc.updateStatusHandler(job)
428+
}
429+
430+
if pc.Config.EnableGangScheduling {
431+
minAvailableReplicas := getTotalReplicas(job)
432+
_, err := pc.SyncPodGroup(job, minAvailableReplicas)
433+
if err != nil {
434+
logger.Warnf("Sync PodGroup %v: %v", job.Name, err)
443435
}
444-
return nil
445436
}
446437

447438
// Save the current state of the replicas
@@ -463,8 +454,11 @@ func (pc *PyTorchController) reconcilePyTorchJobs(job *pyv1.PyTorchJob) error {
463454
}
464455
}
465456

466-
// TODO(CPH): Add check here, no need to update the job if the status hasn't changed since last time.
467-
return pc.updateStatusHandler(job)
457+
// No need to update the job if the status hasn't changed since last time.
458+
if !reflect.DeepEqual(*oldStatus, job.Status) {
459+
return pc.updateStatusHandler(job)
460+
}
461+
return nil
468462
}
469463

470464
// satisfiedExpectations returns true if the required adds/dels for the given job have been observed.

pkg/controller.v1beta2/pytorch/controller.go

Lines changed: 13 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -307,14 +307,6 @@ func (pc *PyTorchController) syncPyTorchJob(key string) (bool, error) {
307307
job := sharedJob.DeepCopy()
308308
jobNeedsSync := pc.satisfiedExpectations(job)
309309

310-
if pc.Config.EnableGangScheduling {
311-
minAvailableReplicas := getTotalReplicas(job)
312-
_, err := pc.SyncPodGroup(job, minAvailableReplicas)
313-
if err != nil {
314-
logger.Warnf("Sync PodGroup %v: %v", job.Name, err)
315-
}
316-
}
317-
318310
// Set default for the new job.
319311
scheme.Scheme.Default(job)
320312

@@ -419,13 +411,8 @@ func (pc *PyTorchController) reconcilePyTorchJobs(job *v1beta2.PyTorchJob) error
419411
}
420412

421413
if pc.Config.EnableGangScheduling {
422-
pc.Recorder.Event(job, v1.EventTypeNormal, "JobTerminated", "Job is terminated, deleting PodGroup")
423414
if err := pc.DeletePodGroup(job); err != nil {
424-
pc.Recorder.Eventf(job, v1.EventTypeWarning, "FailedDeletePodGroup", "Error deleting: %v", err)
425415
return err
426-
} else {
427-
pc.Recorder.Eventf(job, v1.EventTypeNormal, "SuccessfulDeletePodGroup", "Deleted PodGroup: %v", job.Name)
428-
429416
}
430417
}
431418

@@ -437,11 +424,15 @@ func (pc *PyTorchController) reconcilePyTorchJobs(job *v1beta2.PyTorchJob) error
437424
job.Status.ReplicaStatuses[rtype].Active = 0
438425
}
439426
}
440-
// no need to update the job if the status hasn't changed since last time.
441-
if !reflect.DeepEqual(*oldStatus, job.Status) {
442-
return pc.updateStatusHandler(job)
427+
return pc.updateStatusHandler(job)
428+
}
429+
430+
if pc.Config.EnableGangScheduling {
431+
minAvailableReplicas := getTotalReplicas(job)
432+
_, err := pc.SyncPodGroup(job, minAvailableReplicas)
433+
if err != nil {
434+
logger.Warnf("Sync PodGroup %v: %v", job.Name, err)
443435
}
444-
return nil
445436
}
446437

447438
// Save the current state of the replicas
@@ -463,8 +454,11 @@ func (pc *PyTorchController) reconcilePyTorchJobs(job *v1beta2.PyTorchJob) error
463454
}
464455
}
465456

466-
// TODO(CPH): Add check here, no need to update the job if the status hasn't changed since last time.
467-
return pc.updateStatusHandler(job)
457+
// No need to update the job if the status hasn't changed since last time.
458+
if !reflect.DeepEqual(*oldStatus, job.Status) {
459+
return pc.updateStatusHandler(job)
460+
}
461+
return nil
468462
}
469463

470464
// satisfiedExpectations returns true if the required adds/dels for the given job have been observed.

vendor/github.com/kubeflow/tf-operator/pkg/apis/common/v1/openapi_generated.go

Lines changed: 12 additions & 12 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vendor/github.com/kubeflow/tf-operator/pkg/apis/common/v1/types.go

Lines changed: 27 additions & 22 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)