Skip to content

Commit 248b68c

Browse files
Ambient Code Botclaude
andcommitted
fix(jobset): address CodeRabbit review: compare fully-built spec and improve test assertions
- Move RHOAIENG-48867 delete guard to after the full JobSet build so that all builder mutations (Initializer, Trainer, PodLabels, PodAnnotations) are reflected in the comparison; this replaces the trainerImage workaround - Expand replicatedJobsSpecChanged() to detect removed containers and initContainer image changes (bi-directional comparison for both lists) - Assert DeletePropagationBackground in unit test using client interceptor - Add Eventually wait in integration test AfterEach to ensure resources are fully removed before the next test runs Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 9c4571a commit 248b68c

3 files changed

Lines changed: 114 additions & 59 deletions

File tree

pkg/runtime/framework/plugins/jobset/jobset.go

Lines changed: 69 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,32 @@ func (j *JobSet) Build(ctx context.Context, info *runtime.Info, trainJob *traine
280280
}
281281
}
282282

283+
// Init the JobSet apply configuration from the runtime template spec.
284+
// The full build must happen before the spec comparison below (RHOAIENG-48867)
285+
// so that all builder mutations (Initializer, Trainer, PodLabels, PodAnnotations)
286+
// are reflected in the desired state used for the comparison.
287+
jobSetBuilder := NewBuilder(jobsetv1alpha2ac.JobSet(trainJob.Name, trainJob.Namespace).
288+
WithLabels(maps.Clone(info.Labels)).
289+
WithAnnotations(maps.Clone(info.Annotations)).
290+
WithSpec(jobSetSpec))
291+
292+
// TODO (andreyvelich): Refactor the builder with wrappers for PodSpec.
293+
// TODO: Once we remove deprecated runtime.Info.Trainer, we should remove JobSet Builder with DeprecatedTrainer().
294+
jobSet := jobSetBuilder.
295+
Initializer(trainJob).
296+
Trainer(info, trainJob).
297+
PodLabels(info.Scheduler.PodLabels).
298+
PodAnnotations(info.Scheduler.PodAnnotations).
299+
Suspend(trainJob.Spec.Suspend).
300+
Build().
301+
WithOwnerReferences(metav1ac.OwnerReference().
302+
WithAPIVersion(trainer.GroupVersion.String()).
303+
WithKind(trainer.TrainJobKind).
304+
WithName(trainJob.Name).
305+
WithUID(trainJob.UID).
306+
WithController(true).
307+
WithBlockOwnerDeletion(true))
308+
283309
// RHOAIENG-48867: If the existing JobSet is suspended and the TrainJob has been
284310
// resumed (Kueue admitted it, Suspend=false) but spec.replicatedJobs has changed
285311
// (e.g., container images updated during a ClusterTrainingRuntime upgrade), delete
@@ -292,18 +318,13 @@ func (j *JobSet) Build(ctx context.Context, info *runtime.Info, trainJob *traine
292318
// while suspended). Delete-recreate is only needed when the TrainJob transitions
293319
// from suspended to running and the runtime spec has changed.
294320
//
295-
// trainerImage corrects the node container comparison: jobSetSpec holds the raw
296-
// runtime template image for the node container at this point (the builder applies
297-
// TrainJob.Spec.Trainer.Image override later). Without this, every normal admission
298-
// would incorrectly detect a node image change and trigger an unnecessary deletion.
299-
var trainerImage *string
300-
if trainJob.Spec.Trainer != nil {
301-
trainerImage = trainJob.Spec.Trainer.Image
302-
}
321+
// The comparison uses the fully-built desired JobSet spec so that all builder
322+
// mutations (Initializer, Trainer, PodLabels, PodAnnotations) are accounted for,
323+
// covering container and initContainer images, added and removed containers.
303324
if oldJobSet != nil &&
304325
ptr.Deref(oldJobSet.Spec.Suspend, false) &&
305326
!ptr.Deref(trainJob.Spec.Suspend, false) &&
306-
replicatedJobsSpecChanged(oldJobSet.Spec.ReplicatedJobs, jobSetSpec.ReplicatedJobs, trainerImage) {
327+
replicatedJobsSpecChanged(oldJobSet.Spec.ReplicatedJobs, jobSet.Spec.ReplicatedJobs) {
307328
j.logger.Info("Deleting stale suspended JobSet: spec.replicatedJobs changed post-upgrade, will recreate",
308329
"jobSet", client.ObjectKeyFromObject(trainJob))
309330
// Use background propagation: the JobSet is suspended so there are no
@@ -318,29 +339,6 @@ func (j *JobSet) Build(ctx context.Context, info *runtime.Info, trainJob *traine
318339
return nil, nil
319340
}
320341

321-
// Init the JobSet apply configuration from the runtime template spec
322-
jobSetBuilder := NewBuilder(jobsetv1alpha2ac.JobSet(trainJob.Name, trainJob.Namespace).
323-
WithLabels(maps.Clone(info.Labels)).
324-
WithAnnotations(maps.Clone(info.Annotations)).
325-
WithSpec(jobSetSpec))
326-
327-
// TODO (andreyvelich): Refactor the builder with wrappers for PodSpec.
328-
// TODO: Once we remove deprecated runtime.Info.Trainer, we should remove JobSet Builder with DeprecatedTrainer().
329-
jobSet := jobSetBuilder.
330-
Initializer(trainJob).
331-
Trainer(info, trainJob).
332-
PodLabels(info.Scheduler.PodLabels).
333-
PodAnnotations(info.Scheduler.PodAnnotations).
334-
Suspend(trainJob.Spec.Suspend).
335-
Build().
336-
WithOwnerReferences(metav1ac.OwnerReference().
337-
WithAPIVersion(trainer.GroupVersion.String()).
338-
WithKind(trainer.TrainJobKind).
339-
WithName(trainJob.Name).
340-
WithUID(trainJob.UID).
341-
WithController(true).
342-
WithBlockOwnerDeletion(true))
343-
344342
return []apiruntime.ApplyConfiguration{jobSet}, nil
345343
}
346344

@@ -350,15 +348,12 @@ func (j *JobSet) Build(ctx context.Context, info *runtime.Info, trainJob *traine
350348
// scenario where a ClusterTrainingRuntime change updates the runtime container image.
351349
// RHOAIENG-48867
352350
//
353-
// trainerImage is the TrainJob.Spec.Trainer.Image override (may be nil). The
354-
// builder applies this override to the node container after this function is
355-
// called, so the raw jobSetSpec still holds the runtime template image for that
356-
// container. Passing trainerImage here ensures we compare the effective image
357-
// that will end up in the JobSet, avoiding false-positive deletions.
351+
// desired must be the fully-built desired spec — all builder mutations (Initializer,
352+
// Trainer, PodLabels, PodAnnotations) must be applied before calling this function
353+
// so that the comparison reflects the effective state that will be applied to the cluster.
358354
func replicatedJobsSpecChanged(
359355
existing []jobsetv1alpha2.ReplicatedJob,
360356
desired []jobsetv1alpha2ac.ReplicatedJobApplyConfiguration,
361-
trainerImage *string,
362357
) bool {
363358
if len(existing) != len(desired) {
364359
return true
@@ -380,23 +375,48 @@ func replicatedJobsSpecChanged(
380375
d.Template.Spec.Template == nil || d.Template.Spec.Template.Spec == nil {
381376
continue
382377
}
383-
existingImages := make(map[string]string, len(e.Template.Spec.Template.Spec.Containers))
384-
for _, c := range e.Template.Spec.Template.Spec.Containers {
378+
dSpec := d.Template.Spec.Template.Spec
379+
eSpec := &e.Template.Spec.Template.Spec
380+
381+
// Check containers: detect changed/added images (desired → existing direction).
382+
existingImages := make(map[string]string, len(eSpec.Containers))
383+
for _, c := range eSpec.Containers {
385384
existingImages[c.Name] = c.Image
386385
}
387-
for _, c := range d.Template.Spec.Template.Spec.Containers {
386+
desiredContainerNames := sets.New[string]()
387+
for _, c := range dSpec.Containers {
388+
if c.Name == nil || c.Image == nil {
389+
continue
390+
}
391+
desiredContainerNames.Insert(*c.Name)
392+
if img, found := existingImages[*c.Name]; !found || img != *c.Image {
393+
return true
394+
}
395+
}
396+
// Detect containers removed from desired (existing → desired direction).
397+
for _, c := range eSpec.Containers {
398+
if !desiredContainerNames.Has(c.Name) {
399+
return true
400+
}
401+
}
402+
403+
// Repeat the same checks for initContainers.
404+
existingInitImages := make(map[string]string, len(eSpec.InitContainers))
405+
for _, c := range eSpec.InitContainers {
406+
existingInitImages[c.Name] = c.Image
407+
}
408+
desiredInitNames := sets.New[string]()
409+
for _, c := range dSpec.InitContainers {
388410
if c.Name == nil || c.Image == nil {
389411
continue
390412
}
391-
// For the node (trainer) container, use the TrainJob's trainer image
392-
// override as the effective desired image. The builder applies this
393-
// override after replicatedJobsSpecChanged is called, so jobSetSpec
394-
// still has the raw runtime template image at this point.
395-
effectiveImage := c.Image
396-
if *c.Name == constants.Node && trainerImage != nil {
397-
effectiveImage = trainerImage
413+
desiredInitNames.Insert(*c.Name)
414+
if img, found := existingInitImages[*c.Name]; !found || img != *c.Image {
415+
return true
398416
}
399-
if img, found := existingImages[*c.Name]; !found || img != *effectiveImage {
417+
}
418+
for _, c := range eSpec.InitContainers {
419+
if !desiredInitNames.Has(c.Name) {
400420
return true
401421
}
402422
}

pkg/runtime/framework/plugins/jobset/jobset_test.go

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -122,21 +122,23 @@ func TestBuild_ImmutableJobSetUpgrade(t *testing.T) {
122122
}
123123

124124
cases := map[string]struct {
125-
trainJob *trainer.TrainJob
126-
existingJobSet *jobsetv1alpha2.JobSet
127-
info *runtime.Info
128-
wantNilResult bool // true if Build() should return no apply configs
129-
wantJobSetDeleted bool // true if the existing JobSet should be deleted
130-
wantError error
125+
trainJob *trainer.TrainJob
126+
existingJobSet *jobsetv1alpha2.JobSet
127+
info *runtime.Info
128+
wantNilResult bool // true if Build() should return no apply configs
129+
wantJobSetDeleted bool // true if the existing JobSet should be deleted
130+
wantBackgroundDeletion bool // true if the delete should use Background propagation
131+
wantError error
131132
}{
132133
"suspended JobSet with changed image is deleted on upgrade (RHOAIENG-48867)": {
133134
trainJob: utiltesting.MakeTrainJobWrapper(metav1.NamespaceDefault, "test").
134135
Suspend(false).
135136
Obj(),
136-
existingJobSet: makeSuspendedJobSet(oldImage),
137-
info: makeInfo(newImage),
138-
wantNilResult: true,
139-
wantJobSetDeleted: true,
137+
existingJobSet: makeSuspendedJobSet(oldImage),
138+
info: makeInfo(newImage),
139+
wantNilResult: true,
140+
wantJobSetDeleted: true,
141+
wantBackgroundDeletion: true,
140142
},
141143
"suspended JobSet with unchanged image is not deleted": {
142144
trainJob: utiltesting.MakeTrainJobWrapper(metav1.NamespaceDefault, "test").
@@ -191,10 +193,24 @@ func TestBuild_ImmutableJobSetUpgrade(t *testing.T) {
191193
ctx, cancel = context.WithCancel(ctx)
192194
t.Cleanup(cancel)
193195

196+
var capturedPropagation *metav1.DeletionPropagation
194197
clientBuilder := utiltesting.NewClientBuilder()
195198
if tc.existingJobSet != nil {
196199
clientBuilder = clientBuilder.WithObjects(tc.existingJobSet)
197200
}
201+
// Intercept Delete calls to capture the deletion propagation policy.
202+
clientBuilder = clientBuilder.WithInterceptorFuncs(interceptor.Funcs{
203+
Delete: func(ctx context.Context, cli client.WithWatch, obj client.Object, opts ...client.DeleteOption) error {
204+
if _, ok := obj.(*jobsetv1alpha2.JobSet); ok {
205+
deleteOpts := &client.DeleteOptions{}
206+
for _, o := range opts {
207+
o.ApplyToDelete(deleteOpts)
208+
}
209+
capturedPropagation = deleteOpts.PropagationPolicy
210+
}
211+
return cli.Delete(ctx, obj, opts...)
212+
},
213+
})
198214
cli := clientBuilder.Build()
199215

200216
p, err := New(ctx, cli, nil)
@@ -224,6 +240,14 @@ func TestBuild_ImmutableJobSetUpgrade(t *testing.T) {
224240
t.Errorf("Expected existing JobSet to remain, but it was deleted")
225241
}
226242
}
243+
// Verify deletion propagation policy for cases that expect background deletion.
244+
if tc.wantBackgroundDeletion {
245+
if capturedPropagation == nil {
246+
t.Errorf("Expected Delete to be called with Background propagation, but Delete was not called")
247+
} else if *capturedPropagation != metav1.DeletePropagationBackground {
248+
t.Errorf("Expected DeletePropagationBackground, got %v", *capturedPropagation)
249+
}
250+
}
227251
})
228252
}
229253
}

test/integration/controller/trainjob_controller_test.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1455,6 +1455,17 @@ alpha-node-0-1.alpha slots=8
14551455
ginkgo.AfterEach(func() {
14561456
gomega.Expect(k8sClient.DeleteAllOf(ctx, &trainer.TrainJob{}, client.InNamespace(ns.Name))).Should(gomega.Succeed())
14571457
gomega.Expect(k8sClient.DeleteAllOf(ctx, &trainer.ClusterTrainingRuntime{})).Should(gomega.Succeed())
1458+
// Wait for resources to be fully removed to avoid name collisions in subsequent tests.
1459+
gomega.Eventually(func(g gomega.Gomega) {
1460+
trainJobList := &trainer.TrainJobList{}
1461+
g.Expect(k8sClient.List(ctx, trainJobList, client.InNamespace(ns.Name))).Should(gomega.Succeed())
1462+
g.Expect(trainJobList.Items).Should(gomega.BeEmpty())
1463+
}, util.Timeout, util.Interval).Should(gomega.Succeed())
1464+
gomega.Eventually(func(g gomega.Gomega) {
1465+
runtimeList := &trainer.ClusterTrainingRuntimeList{}
1466+
g.Expect(k8sClient.List(ctx, runtimeList)).Should(gomega.Succeed())
1467+
g.Expect(runtimeList.Items).Should(gomega.BeEmpty())
1468+
}, util.Timeout, util.Interval).Should(gomega.Succeed())
14581469
})
14591470

14601471
ginkgo.BeforeEach(func() {

0 commit comments

Comments
 (0)