Skip to content
This repository was archived by the owner on Sep 19, 2022. It is now read-only.

Commit fd773c0

Browse files
johnugeorgek8s-ci-robot
authored andcommitted
Use kube-batch as scheduler by default when gang-scheduling is enabled (#149)
1 parent c3ae150 commit fd773c0

File tree

1 file changed

+27
-0
lines changed
  • pkg/controller.v1beta2/pytorch

1 file changed

+27
-0
lines changed

pkg/controller.v1beta2/pytorch/pod.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,15 @@ import (
3434
)
3535

3636
const (
37+
// gang scheduler name.
38+
gangSchedulerName = "kube-batch"
3739
// podTemplateRestartPolicyReason is the warning reason when the restart
3840
// policy is set in pod template.
3941
podTemplateRestartPolicyReason = "SettedPodTemplateRestartPolicy"
4042
exitedWithCodeReason = "ExitedWithCode"
43+
// podTemplateSchedulerNameReason is the warning reason when other scheduler name is set
44+
// in pod templates with gang-scheduling enabled
45+
podTemplateSchedulerNameReason = "SettedPodTemplateSchedulerName"
4146
)
4247

4348
// reconcilePods checks and updates pods for each given PyTorchReplicaSpec.
@@ -206,6 +211,19 @@ func (pc *PyTorchController) createNewPod(job *v1beta2.PyTorchJob, rtype v1beta2
206211
}
207212
setRestartPolicy(podTemplate, spec)
208213

214+
// if gang-scheduling is enabled:
215+
// 1. if user has specified other scheduler, we report a warning without overriding any fields.
216+
// 2. if no SchedulerName is set for pods, then we set the SchedulerName to "kube-batch".
217+
if pc.Config.EnableGangScheduling {
218+
if isNonGangSchedulerSet(job) {
219+
errMsg := "Another scheduler is specified when gang-scheduling is enabled and it will not be overwritten"
220+
logger.Warning(errMsg)
221+
pc.Recorder.Event(job, v1.EventTypeWarning, podTemplateSchedulerNameReason, errMsg)
222+
} else {
223+
podTemplate.Spec.SchedulerName = gangSchedulerName
224+
}
225+
}
226+
209227
err = pc.PodControl.CreatePodsWithControllerRef(job.Namespace, podTemplate, job, controllerRef)
210228
if err != nil && k8serrors.IsTimeout(err) {
211229
// Pod is created but its initialization has timed out.
@@ -278,3 +296,12 @@ func setRestartPolicy(podTemplateSpec *v1.PodTemplateSpec, spec *common.ReplicaS
278296
podTemplateSpec.Spec.RestartPolicy = v1.RestartPolicy(spec.RestartPolicy)
279297
}
280298
}
299+
300+
func isNonGangSchedulerSet(job *v1beta2.PyTorchJob) bool {
301+
for _, spec := range job.Spec.PyTorchReplicaSpecs {
302+
if spec.Template.Spec.SchedulerName != "" && spec.Template.Spec.SchedulerName != gangSchedulerName {
303+
return true
304+
}
305+
}
306+
return false
307+
}

0 commit comments

Comments
 (0)