|
| 1 | +// Copyright 2026 NVIDIA CORPORATION |
| 2 | +// SPDX-License-Identifier: Apache-2.0 |
| 3 | + |
| 4 | +package rd |
| 5 | + |
| 6 | +import ( |
| 7 | + "context" |
| 8 | + "fmt" |
| 9 | + "maps" |
| 10 | + "time" |
| 11 | + |
| 12 | + batchv1 "k8s.io/api/batch/v1" |
| 13 | + v1 "k8s.io/api/core/v1" |
| 14 | + "k8s.io/apimachinery/pkg/api/errors" |
| 15 | + "k8s.io/apimachinery/pkg/types" |
| 16 | + "k8s.io/apimachinery/pkg/util/wait" |
| 17 | + "k8s.io/utils/ptr" |
| 18 | + runtimeClient "sigs.k8s.io/controller-runtime/pkg/client" |
| 19 | + |
| 20 | + v2 "github.com/kai-scheduler/KAI-scheduler/pkg/apis/scheduling/v2" |
| 21 | + "github.com/kai-scheduler/KAI-scheduler/pkg/apis/scheduling/v2alpha2" |
| 22 | + pgconstants "github.com/kai-scheduler/KAI-scheduler/pkg/podgrouper/podgrouper/plugins/constants" |
| 23 | +) |
| 24 | + |
const (
	// JobNameLabel is the label the k8s Job controller sets on every pod it creates.
	// waitForJobPods uses it as the label selector to find the pods of a given Job.
	JobNameLabel = "batch.kubernetes.io/job-name"

	// podGroupFetchTimeout bounds each polling wait in this file. Despite the name it
	// is shared by both the PodGroup wait and the Job-pods wait.
	podGroupFetchTimeout = 30 * time.Second
	// podGroupFetchPoll is the interval between polling attempts in those waits.
	podGroupFetchPoll = 250 * time.Millisecond
)
| 32 | + |
// DistributedBatchJobOptions configures CreateDistributedBatchJob. Every field is optional
// — pass DistributedBatchJobOptions{} to get a single-pod gang Job with no resource requests.
type DistributedBatchJobOptions struct {
	// Parallelism is the number of pods the Job spawns. nil means 1. The same value is
	// also written to the Job's Completions.
	Parallelism *int32
	// MinMember is the PodGroup MinAvailable. nil means Parallelism (gang). The value is
	// written to the Job as the min-member override annotation.
	// Gang: MinMember == Parallelism
	// Elastic: 1 <= MinMember < Parallelism
	// NOTE(review): these ranges are not validated by the helper — callers must respect them.
	MinMember *int32
	// Resources applied to each pod. Zero value means no requests/limits.
	Resources v1.ResourceRequirements
	// NamePrefix is prepended to the generated Job name.
	NamePrefix string
	// TopologyConstraint is propagated to the auto-created PodGroup via annotations.
	// Only its non-empty fields are written as Job annotations; nil writes none.
	TopologyConstraint *v2alpha2.TopologyConstraint
	// PriorityClassName is set on the pod template; the podgrouper reads it onto the PodGroup.
	// An empty string leaves the pod template's priority class untouched.
	PriorityClassName string
	// Preemptibility is set as a Job label; the podgrouper reads it onto the PodGroup.
	// An empty value adds no label.
	Preemptibility v2alpha2.Preemptibility
	// ExtraLabels are merged into pod template labels (e.g. for test filtering). On a key
	// collision the ExtraLabels value replaces the existing one.
	ExtraLabels map[string]string
	// PodSpecMutator is applied to the pod template spec after defaults are set. Scale
	// tests use this to inject KWOK tolerations/affinity without importing scale into rd.
	// It runs last, so it sees (and may override) everything the other options configured.
	PodSpecMutator func(*v1.PodSpec)
}
| 58 | + |
| 59 | +// CreateDistributedBatchJob submits a batch Job annotated with kai.scheduler/batch-min-member |
| 60 | +// so the podgrouper produces a single PodGroup with MinAvailable=opts.MinMember. Returns the |
| 61 | +// Job, the PodGroup (once the podgrouper has created it), and the pods the Job spawned. |
| 62 | +func CreateDistributedBatchJob( |
| 63 | + ctx context.Context, |
| 64 | + kubeClient runtimeClient.Client, |
| 65 | + jobQueue *v2.Queue, |
| 66 | + opts DistributedBatchJobOptions, |
| 67 | +) (*batchv1.Job, *v2alpha2.PodGroup, []*v1.Pod, error) { |
| 68 | + parallelism := ptr.Deref(opts.Parallelism, 1) |
| 69 | + minMember := ptr.Deref(opts.MinMember, parallelism) |
| 70 | + |
| 71 | + job := buildDistributedBatchJob(jobQueue, opts, parallelism, minMember) |
| 72 | + if err := kubeClient.Create(ctx, job); err != nil { |
| 73 | + return nil, nil, nil, fmt.Errorf("create Job: %w", err) |
| 74 | + } |
| 75 | + |
| 76 | + podGroup, err := waitForPodGroup(ctx, kubeClient, job) |
| 77 | + if err != nil { |
| 78 | + return job, nil, nil, err |
| 79 | + } |
| 80 | + |
| 81 | + pods, err := waitForJobPods(ctx, kubeClient, job, parallelism) |
| 82 | + if err != nil { |
| 83 | + return job, podGroup, nil, err |
| 84 | + } |
| 85 | + |
| 86 | + return job, podGroup, pods, nil |
| 87 | +} |
| 88 | + |
| 89 | +func buildDistributedBatchJob( |
| 90 | + jobQueue *v2.Queue, opts DistributedBatchJobOptions, parallelism, minMember int32, |
| 91 | +) *batchv1.Job { |
| 92 | + job := CreateBatchJobObject(jobQueue, opts.Resources) |
| 93 | + job.Name = opts.NamePrefix + job.Name |
| 94 | + job.Spec.Parallelism = ptr.To(parallelism) |
| 95 | + job.Spec.Completions = ptr.To(parallelism) |
| 96 | + |
| 97 | + if job.Annotations == nil { |
| 98 | + job.Annotations = map[string]string{} |
| 99 | + } |
| 100 | + job.Annotations[pgconstants.MinMemberOverrideKey] = fmt.Sprintf("%d", minMember) |
| 101 | + |
| 102 | + if tc := opts.TopologyConstraint; tc != nil { |
| 103 | + if tc.Topology != "" { |
| 104 | + job.Annotations[pgconstants.TopologyKey] = tc.Topology |
| 105 | + } |
| 106 | + if tc.RequiredTopologyLevel != "" { |
| 107 | + job.Annotations[pgconstants.TopologyRequiredPlacementKey] = tc.RequiredTopologyLevel |
| 108 | + } |
| 109 | + if tc.PreferredTopologyLevel != "" { |
| 110 | + job.Annotations[pgconstants.TopologyPreferredPlacementKey] = tc.PreferredTopologyLevel |
| 111 | + } |
| 112 | + } |
| 113 | + |
| 114 | + if opts.Preemptibility != "" { |
| 115 | + job.Labels[pgconstants.PreemptibilityLabelKey] = string(opts.Preemptibility) |
| 116 | + } |
| 117 | + |
| 118 | + if opts.PriorityClassName != "" { |
| 119 | + job.Spec.Template.Spec.PriorityClassName = opts.PriorityClassName |
| 120 | + } |
| 121 | + |
| 122 | + maps.Copy(job.Spec.Template.ObjectMeta.Labels, opts.ExtraLabels) |
| 123 | + |
| 124 | + if opts.PodSpecMutator != nil { |
| 125 | + opts.PodSpecMutator(&job.Spec.Template.Spec) |
| 126 | + } |
| 127 | + |
| 128 | + return job |
| 129 | +} |
| 130 | + |
| 131 | +func waitForPodGroup( |
| 132 | + ctx context.Context, kubeClient runtimeClient.Client, job *batchv1.Job, |
| 133 | +) (*v2alpha2.PodGroup, error) { |
| 134 | + name := PodGroupNameForJob(job) |
| 135 | + pg := &v2alpha2.PodGroup{} |
| 136 | + key := types.NamespacedName{Namespace: job.Namespace, Name: name} |
| 137 | + |
| 138 | + err := wait.PollUntilContextTimeout(ctx, podGroupFetchPoll, podGroupFetchTimeout, true, |
| 139 | + func(ctx context.Context) (bool, error) { |
| 140 | + err := kubeClient.Get(ctx, key, pg) |
| 141 | + if errors.IsNotFound(err) { |
| 142 | + return false, nil |
| 143 | + } |
| 144 | + return err == nil, err |
| 145 | + }) |
| 146 | + if err != nil { |
| 147 | + return nil, fmt.Errorf("wait for PodGroup %s: %w", name, err) |
| 148 | + } |
| 149 | + return pg, nil |
| 150 | +} |
| 151 | + |
| 152 | +func waitForJobPods( |
| 153 | + ctx context.Context, kubeClient runtimeClient.Client, job *batchv1.Job, expected int32, |
| 154 | +) ([]*v1.Pod, error) { |
| 155 | + var pods []*v1.Pod |
| 156 | + err := wait.PollUntilContextTimeout(ctx, podGroupFetchPoll, podGroupFetchTimeout, true, |
| 157 | + func(ctx context.Context) (bool, error) { |
| 158 | + list := &v1.PodList{} |
| 159 | + err := kubeClient.List(ctx, list, |
| 160 | + runtimeClient.InNamespace(job.Namespace), |
| 161 | + runtimeClient.MatchingLabels{JobNameLabel: job.Name}, |
| 162 | + ) |
| 163 | + if err != nil { |
| 164 | + return false, err |
| 165 | + } |
| 166 | + if int32(len(list.Items)) < expected { |
| 167 | + return false, nil |
| 168 | + } |
| 169 | + pods = make([]*v1.Pod, 0, len(list.Items)) |
| 170 | + for i := range list.Items { |
| 171 | + pods = append(pods, &list.Items[i]) |
| 172 | + } |
| 173 | + return true, nil |
| 174 | + }) |
| 175 | + if err != nil { |
| 176 | + return nil, fmt.Errorf("wait for %d pods of Job %s: %w", expected, job.Name, err) |
| 177 | + } |
| 178 | + return pods, nil |
| 179 | +} |
| 180 | + |
| 181 | +// PodGroupNameForJob returns the deterministic name the podgrouper uses for a Job-owned PodGroup. |
| 182 | +func PodGroupNameForJob(job *batchv1.Job) string { |
| 183 | + return fmt.Sprintf("%s-%s-%s", pgconstants.PodGroupNamePrefix, job.Name, job.UID) |
| 184 | +} |
0 commit comments