@@ -34,10 +34,15 @@ import (
34
34
)
35
35
36
36
const (
37
+ // gangSchedulerName is the scheduler name set on pods when gang-scheduling is enabled.
38
+ gangSchedulerName = "kube-batch"
37
39
// podTemplateRestartPolicyReason is the warning reason when the restart
38
40
// policy is set in pod template.
39
41
podTemplateRestartPolicyReason = "SettedPodTemplateRestartPolicy"
40
42
exitedWithCodeReason = "ExitedWithCode"
43
+ // podTemplateSchedulerNameReason is the warning reason when another scheduler name is set
44
+ // in pod templates while gang-scheduling is enabled.
45
+ podTemplateSchedulerNameReason = "SettedPodTemplateSchedulerName"
41
46
)
42
47
43
48
// reconcilePods checks and updates pods for each given PyTorchReplicaSpec.
@@ -206,6 +211,19 @@ func (pc *PyTorchController) createNewPod(job *v1beta2.PyTorchJob, rtype v1beta2
206
211
}
207
212
setRestartPolicy (podTemplate , spec )
208
213
214
+ // if gang-scheduling is enabled:
215
+ // 1. if user has specified other scheduler, we report a warning without overriding any fields.
216
+ // 2. if no SchedulerName is set for pods, then we set the SchedulerName to "kube-batch".
217
+ if pc .Config .EnableGangScheduling {
218
+ if isNonGangSchedulerSet (job ) {
219
+ errMsg := "Another scheduler is specified when gang-scheduling is enabled and it will not be overwritten"
220
+ logger .Warning (errMsg )
221
+ pc .Recorder .Event (job , v1 .EventTypeWarning , podTemplateSchedulerNameReason , errMsg )
222
+ } else {
223
+ podTemplate .Spec .SchedulerName = gangSchedulerName
224
+ }
225
+ }
226
+
209
227
err = pc .PodControl .CreatePodsWithControllerRef (job .Namespace , podTemplate , job , controllerRef )
210
228
if err != nil && k8serrors .IsTimeout (err ) {
211
229
// Pod is created but its initialization has timed out.
@@ -278,3 +296,12 @@ func setRestartPolicy(podTemplateSpec *v1.PodTemplateSpec, spec *common.ReplicaS
278
296
podTemplateSpec .Spec .RestartPolicy = v1 .RestartPolicy (spec .RestartPolicy )
279
297
}
280
298
}
299
+
300
+ func isNonGangSchedulerSet (job * v1beta2.PyTorchJob ) bool {
301
+ for _ , spec := range job .Spec .PyTorchReplicaSpecs {
302
+ if spec .Template .Spec .SchedulerName != "" && spec .Template .Spec .SchedulerName != gangSchedulerName {
303
+ return true
304
+ }
305
+ }
306
+ return false
307
+ }
0 commit comments