Skip to content

Commit 1667bcb

Browse files
authored
Extract Trainjob from the Kubeflow directory (#7081)
1 parent cd031d1 commit 1667bcb

File tree

16 files changed

+109
-109
lines changed

16 files changed

+109
-109
lines changed

charts/kueue/templates/webhook/manifests.yaml

Lines changed: 40 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -242,26 +242,6 @@ webhooks:
242242
- xgboostjobs
243243
sideEffects: None
244244
reinvocationPolicy: '{{ .Values.mutatingWebhook.reinvocationPolicy }}'
245-
- admissionReviewVersions:
246-
- v1
247-
clientConfig:
248-
service:
249-
name: '{{ include "kueue.fullname" . }}-webhook-service'
250-
namespace: '{{ .Release.Namespace }}'
251-
path: /mutate-trainer-kubeflow-org-v1alpha1-trainjob
252-
failurePolicy: Fail
253-
name: mtrainjob.kb.io
254-
rules:
255-
- apiGroups:
256-
- trainer.kubeflow.org
257-
apiVersions:
258-
- v1alpha1
259-
operations:
260-
- CREATE
261-
resources:
262-
- trainjobs
263-
sideEffects: None
264-
reinvocationPolicy: '{{ .Values.mutatingWebhook.reinvocationPolicy }}'
265245
- admissionReviewVersions:
266246
- v1
267247
clientConfig:
@@ -430,6 +410,26 @@ webhooks:
430410
- statefulsets
431411
sideEffects: None
432412
reinvocationPolicy: '{{ .Values.mutatingWebhook.reinvocationPolicy }}'
413+
- admissionReviewVersions:
414+
- v1
415+
clientConfig:
416+
service:
417+
name: '{{ include "kueue.fullname" . }}-webhook-service'
418+
namespace: '{{ .Release.Namespace }}'
419+
path: /mutate-trainer-kubeflow-org-v1alpha1-trainjob
420+
failurePolicy: Fail
421+
name: mtrainjob.kb.io
422+
rules:
423+
- apiGroups:
424+
- trainer.kubeflow.org
425+
apiVersions:
426+
- v1alpha1
427+
operations:
428+
- CREATE
429+
resources:
430+
- trainjobs
431+
sideEffects: None
432+
reinvocationPolicy: '{{ .Values.mutatingWebhook.reinvocationPolicy }}'
433433
- admissionReviewVersions:
434434
- v1
435435
clientConfig:
@@ -730,26 +730,6 @@ webhooks:
730730
resources:
731731
- xgboostjobs
732732
sideEffects: None
733-
- admissionReviewVersions:
734-
- v1
735-
clientConfig:
736-
service:
737-
name: '{{ include "kueue.fullname" . }}-webhook-service'
738-
namespace: '{{ .Release.Namespace }}'
739-
path: /validate-trainer-kubeflow-org-v1alpha1-trainjob
740-
failurePolicy: Fail
741-
name: vtrainjob.kb.io
742-
rules:
743-
- apiGroups:
744-
- trainer.kubeflow.org
745-
apiVersions:
746-
- v1alpha1
747-
operations:
748-
- CREATE
749-
- UPDATE
750-
resources:
751-
- trainjobs
752-
sideEffects: None
753733
- admissionReviewVersions:
754734
- v1
755735
clientConfig:
@@ -916,6 +896,26 @@ webhooks:
916896
resources:
917897
- statefulsets
918898
sideEffects: None
899+
- admissionReviewVersions:
900+
- v1
901+
clientConfig:
902+
service:
903+
name: '{{ include "kueue.fullname" . }}-webhook-service'
904+
namespace: '{{ .Release.Namespace }}'
905+
path: /validate-trainer-kubeflow-org-v1alpha1-trainjob
906+
failurePolicy: Fail
907+
name: vtrainjob.kb.io
908+
rules:
909+
- apiGroups:
910+
- trainer.kubeflow.org
911+
apiVersions:
912+
- v1alpha1
913+
operations:
914+
- CREATE
915+
- UPDATE
916+
resources:
917+
- trainjobs
918+
sideEffects: None
919919
- admissionReviewVersions:
920920
- v1
921921
clientConfig:

config/components/webhook/manifests.yaml

Lines changed: 39 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -176,25 +176,6 @@ webhooks:
176176
resources:
177177
- xgboostjobs
178178
sideEffects: None
179-
- admissionReviewVersions:
180-
- v1
181-
clientConfig:
182-
service:
183-
name: webhook-service
184-
namespace: system
185-
path: /mutate-trainer-kubeflow-org-v1alpha1-trainjob
186-
failurePolicy: Fail
187-
name: mtrainjob.kb.io
188-
rules:
189-
- apiGroups:
190-
- trainer.kubeflow.org
191-
apiVersions:
192-
- v1alpha1
193-
operations:
194-
- CREATE
195-
resources:
196-
- trainjobs
197-
sideEffects: None
198179
- admissionReviewVersions:
199180
- v1
200181
clientConfig:
@@ -311,6 +292,25 @@ webhooks:
311292
resources:
312293
- statefulsets
313294
sideEffects: None
295+
- admissionReviewVersions:
296+
- v1
297+
clientConfig:
298+
service:
299+
name: webhook-service
300+
namespace: system
301+
path: /mutate-trainer-kubeflow-org-v1alpha1-trainjob
302+
failurePolicy: Fail
303+
name: mtrainjob.kb.io
304+
rules:
305+
- apiGroups:
306+
- trainer.kubeflow.org
307+
apiVersions:
308+
- v1alpha1
309+
operations:
310+
- CREATE
311+
resources:
312+
- trainjobs
313+
sideEffects: None
314314
- admissionReviewVersions:
315315
- v1
316316
clientConfig:
@@ -554,26 +554,6 @@ webhooks:
554554
resources:
555555
- xgboostjobs
556556
sideEffects: None
557-
- admissionReviewVersions:
558-
- v1
559-
clientConfig:
560-
service:
561-
name: webhook-service
562-
namespace: system
563-
path: /validate-trainer-kubeflow-org-v1alpha1-trainjob
564-
failurePolicy: Fail
565-
name: vtrainjob.kb.io
566-
rules:
567-
- apiGroups:
568-
- trainer.kubeflow.org
569-
apiVersions:
570-
- v1alpha1
571-
operations:
572-
- CREATE
573-
- UPDATE
574-
resources:
575-
- trainjobs
576-
sideEffects: None
577557
- admissionReviewVersions:
578558
- v1
579559
clientConfig:
@@ -694,6 +674,26 @@ webhooks:
694674
resources:
695675
- statefulsets
696676
sideEffects: None
677+
- admissionReviewVersions:
678+
- v1
679+
clientConfig:
680+
service:
681+
name: webhook-service
682+
namespace: system
683+
path: /validate-trainer-kubeflow-org-v1alpha1-trainjob
684+
failurePolicy: Fail
685+
name: vtrainjob.kb.io
686+
rules:
687+
- apiGroups:
688+
- trainer.kubeflow.org
689+
apiVersions:
690+
- v1alpha1
691+
operations:
692+
- CREATE
693+
- UPDATE
694+
resources:
695+
- trainjobs
696+
sideEffects: None
697697
- admissionReviewVersions:
698698
- v1
699699
clientConfig:

pkg/controller/jobs/jobs.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,11 +23,11 @@ import (
2323
_ "sigs.k8s.io/kueue/pkg/controller/jobs/job"
2424
_ "sigs.k8s.io/kueue/pkg/controller/jobs/jobset"
2525
_ "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs"
26-
_ "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/trainjob"
2726
_ "sigs.k8s.io/kueue/pkg/controller/jobs/leaderworkerset"
2827
_ "sigs.k8s.io/kueue/pkg/controller/jobs/mpijob"
2928
_ "sigs.k8s.io/kueue/pkg/controller/jobs/pod"
3029
_ "sigs.k8s.io/kueue/pkg/controller/jobs/raycluster"
3130
_ "sigs.k8s.io/kueue/pkg/controller/jobs/rayjob"
3231
_ "sigs.k8s.io/kueue/pkg/controller/jobs/statefulset"
32+
_ "sigs.k8s.io/kueue/pkg/controller/jobs/trainjob"
3333
)

pkg/controller/jobs/kubeflow/trainjob/trainjob_controller.go renamed to pkg/controller/jobs/trainjob/trainjob_controller.go

Lines changed: 22 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@ import (
2323
"fmt"
2424
"strconv"
2525

26-
kftrainerapi "github.com/kubeflow/trainer/v2/pkg/apis/trainer/v1alpha1"
26+
kftrainer "github.com/kubeflow/trainer/v2/pkg/apis/trainer/v1alpha1"
2727
kftrainerruntime "github.com/kubeflow/trainer/v2/pkg/runtime"
2828
kftrainerruntimecore "github.com/kubeflow/trainer/v2/pkg/runtime/core"
2929
kftrainerjobset "github.com/kubeflow/trainer/v2/pkg/runtime/framework/plugins/jobset"
@@ -42,15 +42,15 @@ import (
4242

4343
kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
4444
"sigs.k8s.io/kueue/pkg/controller/jobframework"
45-
kJobset "sigs.k8s.io/kueue/pkg/controller/jobs/jobset"
45+
workloadjobset "sigs.k8s.io/kueue/pkg/controller/jobs/jobset"
4646
"sigs.k8s.io/kueue/pkg/features"
4747
"sigs.k8s.io/kueue/pkg/podset"
4848
clientutil "sigs.k8s.io/kueue/pkg/util/client"
4949
"sigs.k8s.io/kueue/pkg/util/slices"
5050
)
5151

5252
var (
53-
gvk = kftrainerapi.GroupVersion.WithKind("TrainJob")
53+
gvk = kftrainer.GroupVersion.WithKind("TrainJob")
5454
FrameworkName = "trainer.kubeflow.org/trainjob"
5555
TrainJobControllerName = "trainer.kubeflow.org/trainjob-controller"
5656
)
@@ -66,8 +66,8 @@ func init() {
6666
NewJob: NewJob,
6767
NewReconciler: NewReconciler,
6868
SetupWebhook: SetupTrainJobWebhook,
69-
JobType: &kftrainerapi.TrainJob{},
70-
AddToScheme: kftrainerapi.AddToScheme,
69+
JobType: &kftrainer.TrainJob{},
70+
AddToScheme: kftrainer.AddToScheme,
7171
MultiKueueAdapter: &multiKueueAdapter{},
7272
}))
7373
}
@@ -108,11 +108,11 @@ func (r *trainJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
108108

109109
func (r *trainJobReconciler) SetupWithManager(mgr ctrl.Manager) error {
110110
b := ctrl.NewControllerManagedBy(mgr).
111-
For(&kftrainerapi.TrainJob{}).Owns(&kueue.Workload{}).Owns(&jobsetapi.JobSet{})
111+
For(&kftrainer.TrainJob{}).Owns(&kueue.Workload{}).Owns(&jobsetapi.JobSet{})
112112
return b.Complete(r)
113113
}
114114

115-
type TrainJob kftrainerapi.TrainJob
115+
type TrainJob kftrainer.TrainJob
116116

117117
var _ jobframework.GenericJob = (*TrainJob)(nil)
118118
var _ jobframework.JobWithCustomStop = (*TrainJob)(nil)
@@ -124,11 +124,11 @@ func NewJob() jobframework.GenericJob {
124124
}
125125

126126
func fromObject(obj runtime.Object) *TrainJob {
127-
return (*TrainJob)(obj.(*kftrainerapi.TrainJob))
127+
return (*TrainJob)(obj.(*kftrainer.TrainJob))
128128
}
129129

130130
func (t *TrainJob) Object() client.Object {
131-
return (*kftrainerapi.TrainJob)(t)
131+
return (*kftrainer.TrainJob)(t)
132132
}
133133

134134
func (t *TrainJob) IsSuspended() bool {
@@ -167,7 +167,7 @@ func getChildJobSet(t *TrainJob) (*jobsetapi.JobSet, error) {
167167
return nil, fmt.Errorf("unsupported runtime: %s", runtimeRefGK)
168168
}
169169

170-
trainJob := (*kftrainerapi.TrainJob)(t)
170+
trainJob := (*kftrainer.TrainJob)(t)
171171
trSpec, err := getRuntimeSpec(trainJob)
172172
if err != nil {
173173
return nil, fmt.Errorf("runtime '%s' not found", trainJob.Spec.RuntimeRef.Name)
@@ -212,16 +212,16 @@ func jobsetApplyToJobset(jobsetApply *jobsetapplyapi.JobSetApplyConfiguration) (
212212
return jobset, nil
213213
}
214214

215-
func getRuntimeSpec(trainJob *kftrainerapi.TrainJob) (*kftrainerapi.TrainingRuntimeSpec, error) {
216-
if *trainJob.Spec.RuntimeRef.Kind == kftrainerapi.ClusterTrainingRuntimeKind {
217-
var ctr kftrainerapi.ClusterTrainingRuntime
215+
func getRuntimeSpec(trainJob *kftrainer.TrainJob) (*kftrainer.TrainingRuntimeSpec, error) {
216+
if *trainJob.Spec.RuntimeRef.Kind == kftrainer.ClusterTrainingRuntimeKind {
217+
var ctr kftrainer.ClusterTrainingRuntime
218218
err := reconciler.client.Get(reconciler.ctx, client.ObjectKey{Name: trainJob.Spec.RuntimeRef.Name}, &ctr)
219219
if err != nil {
220220
return nil, err
221221
}
222222
return &ctr.Spec, nil
223223
} else {
224-
var tr kftrainerapi.TrainingRuntime
224+
var tr kftrainer.TrainingRuntime
225225
err := reconciler.client.Get(reconciler.ctx, client.ObjectKey{Namespace: trainJob.Namespace, Name: trainJob.Spec.RuntimeRef.Name}, &tr)
226226
if err != nil {
227227
return nil, err
@@ -235,7 +235,7 @@ func (t *TrainJob) PodSets() ([]kueue.PodSet, error) {
235235
if err != nil {
236236
return nil, err
237237
}
238-
return (*kJobset.JobSet)(jobset).PodSets()
238+
return (*workloadjobset.JobSet)(jobset).PodSets()
239239
}
240240

241241
func (t *TrainJob) RunWithPodSetsInfo(podSetsInfo []podset.PodSetInfo) error {
@@ -249,16 +249,16 @@ func (t *TrainJob) RunWithPodSetsInfo(podSetsInfo []podset.PodSetInfo) error {
249249
}
250250

251251
if t.Spec.PodSpecOverrides == nil {
252-
t.Spec.PodSpecOverrides = []kftrainerapi.PodSpecOverride{}
252+
t.Spec.PodSpecOverrides = []kftrainer.PodSpecOverride{}
253253
}
254254
if t.Annotations == nil {
255255
t.Annotations = map[string]string{}
256256
}
257257
t.Annotations[firstOverrideIdx] = strconv.Itoa(len(t.Spec.PodSpecOverrides))
258258
for _, info := range podSetsInfo {
259259
// The trainjob controller merges each podSpecOverride sequentially, so any existing user provided override will be processed first
260-
t.Spec.PodSpecOverrides = append(t.Spec.PodSpecOverrides, kftrainerapi.PodSpecOverride{
261-
TargetJobs: []kftrainerapi.PodSpecOverrideTargetJob{
260+
t.Spec.PodSpecOverrides = append(t.Spec.PodSpecOverrides, kftrainer.PodSpecOverride{
261+
TargetJobs: []kftrainer.PodSpecOverrideTargetJob{
262262
{Name: string(info.Name)},
263263
},
264264
// TODO: Set the labels/annotations when supported. See https://github.com/kubeflow/trainer/pull/2785
@@ -320,10 +320,10 @@ func (t *TrainJob) RestorePodSetsInfo(_ []podset.PodSetInfo) bool {
320320
}
321321

322322
func (t *TrainJob) Finished() (message string, success, finished bool) {
323-
if c := apimeta.FindStatusCondition(t.Status.Conditions, kftrainerapi.TrainJobComplete); c != nil && c.Status == metav1.ConditionTrue {
323+
if c := apimeta.FindStatusCondition(t.Status.Conditions, kftrainer.TrainJobComplete); c != nil && c.Status == metav1.ConditionTrue {
324324
return c.Message, true, true
325325
}
326-
if c := apimeta.FindStatusCondition(t.Status.Conditions, kftrainerapi.TrainJobFailed); c != nil && c.Status == metav1.ConditionTrue {
326+
if c := apimeta.FindStatusCondition(t.Status.Conditions, kftrainer.TrainJobFailed); c != nil && c.Status == metav1.ConditionTrue {
327327
return c.Message, false, true
328328
}
329329
return message, success, false
@@ -356,15 +356,15 @@ func (t *TrainJob) ReclaimablePods() ([]kueue.ReclaimablePod, error) {
356356
}
357357

358358
ret := make([]kueue.ReclaimablePod, 0, len(jobset.Spec.ReplicatedJobs))
359-
statuses := slices.ToRefMap(t.Status.JobsStatus, func(js *kftrainerapi.JobStatus) string { return js.Name })
359+
statuses := slices.ToRefMap(t.Status.JobsStatus, func(js *kftrainer.JobStatus) string { return js.Name })
360360

361361
for i := range jobset.Spec.ReplicatedJobs {
362362
spec := &jobset.Spec.ReplicatedJobs[i]
363363
if status, found := statuses[spec.Name]; found && status.Succeeded > 0 {
364364
if status.Succeeded > 0 && status.Succeeded <= spec.Replicas {
365365
ret = append(ret, kueue.ReclaimablePod{
366366
Name: kueue.NewPodSetReference(spec.Name),
367-
Count: status.Succeeded * kJobset.PodsCountPerReplica(spec),
367+
Count: status.Succeeded * workloadjobset.PodsCountPerReplica(spec),
368368
})
369369
}
370370
}

pkg/controller/jobs/kubeflow/trainjob/trainjob_controller_test.go renamed to pkg/controller/jobs/trainjob/trainjob_controller_test.go

File renamed without changes.

pkg/controller/jobs/kubeflow/trainjob/trainjob_multikueue_adapter.go renamed to pkg/controller/jobs/trainjob/trainjob_multikueue_adapter.go

File renamed without changes.

pkg/controller/jobs/kubeflow/trainjob/trainjob_multikueue_adapter_test.go renamed to pkg/controller/jobs/trainjob/trainjob_multikueue_adapter_test.go

File renamed without changes.
File renamed without changes.

pkg/controller/jobs/kubeflow/trainjob/trainjob_webhook_test.go renamed to pkg/controller/jobs/trainjob/trainjob_webhook_test.go

File renamed without changes.

test/e2e/multikueue/e2e_test.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -46,12 +46,12 @@ import (
4646
workloadjob "sigs.k8s.io/kueue/pkg/controller/jobs/job"
4747
workloadjobset "sigs.k8s.io/kueue/pkg/controller/jobs/jobset"
4848
workloadpytorchjob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/pytorchjob"
49-
workloadtrainjob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/trainjob"
5049
workloadmpijob "sigs.k8s.io/kueue/pkg/controller/jobs/mpijob"
5150
workloadpod "sigs.k8s.io/kueue/pkg/controller/jobs/pod"
5251
podconstants "sigs.k8s.io/kueue/pkg/controller/jobs/pod/constants"
5352
workloadraycluster "sigs.k8s.io/kueue/pkg/controller/jobs/raycluster"
5453
workloadrayjob "sigs.k8s.io/kueue/pkg/controller/jobs/rayjob"
54+
workloadtrainjob "sigs.k8s.io/kueue/pkg/controller/jobs/trainjob"
5555
"sigs.k8s.io/kueue/pkg/util/admissioncheck"
5656
utilpod "sigs.k8s.io/kueue/pkg/util/pod"
5757
utiltesting "sigs.k8s.io/kueue/pkg/util/testing"

0 commit comments

Comments
 (0)