@@ -280,6 +280,32 @@ func (j *JobSet) Build(ctx context.Context, info *runtime.Info, trainJob *traine
280280 }
281281 }
282282
283+ // Init the JobSet apply configuration from the runtime template spec.
284+ // The full build must happen before the spec comparison below (RHOAIENG-48867)
285+ // so that all builder mutations (Initializer, Trainer, PodLabels, PodAnnotations)
286+ // are reflected in the desired state used for the comparison.
287+ jobSetBuilder := NewBuilder (jobsetv1alpha2ac .JobSet (trainJob .Name , trainJob .Namespace ).
288+ WithLabels (maps .Clone (info .Labels )).
289+ WithAnnotations (maps .Clone (info .Annotations )).
290+ WithSpec (jobSetSpec ))
291+
292+ // TODO (andreyvelich): Refactor the builder with wrappers for PodSpec.
293+ // TODO: Once we remove deprecated runtime.Info.Trainer, we should remove JobSet Builder with DeprecatedTrainer().
294+ jobSet := jobSetBuilder .
295+ Initializer (trainJob ).
296+ Trainer (info , trainJob ).
297+ PodLabels (info .Scheduler .PodLabels ).
298+ PodAnnotations (info .Scheduler .PodAnnotations ).
299+ Suspend (trainJob .Spec .Suspend ).
300+ Build ().
301+ WithOwnerReferences (metav1ac .OwnerReference ().
302+ WithAPIVersion (trainer .GroupVersion .String ()).
303+ WithKind (trainer .TrainJobKind ).
304+ WithName (trainJob .Name ).
305+ WithUID (trainJob .UID ).
306+ WithController (true ).
307+ WithBlockOwnerDeletion (true ))
308+
283309 // RHOAIENG-48867: If the existing JobSet is suspended and the TrainJob has been
284310 // resumed (Kueue admitted it, Suspend=false) but spec.replicatedJobs has changed
285311 // (e.g., container images updated during a ClusterTrainingRuntime upgrade), delete
@@ -292,18 +318,13 @@ func (j *JobSet) Build(ctx context.Context, info *runtime.Info, trainJob *traine
292318 // while suspended). Delete-recreate is only needed when the TrainJob transitions
293319 // from suspended to running and the runtime spec has changed.
294320 //
295- // trainerImage corrects the node container comparison: jobSetSpec holds the raw
296- // runtime template image for the node container at this point (the builder applies
297- // TrainJob.Spec.Trainer.Image override later). Without this, every normal admission
298- // would incorrectly detect a node image change and trigger an unnecessary deletion.
299- var trainerImage * string
300- if trainJob .Spec .Trainer != nil {
301- trainerImage = trainJob .Spec .Trainer .Image
302- }
321+ // The comparison uses the fully-built desired JobSet spec so that all builder
322+ // mutations (Initializer, Trainer, PodLabels, PodAnnotations) are accounted for,
323+ // covering container and initContainer images, added and removed containers.
303324 if oldJobSet != nil &&
304325 ptr .Deref (oldJobSet .Spec .Suspend , false ) &&
305326 ! ptr .Deref (trainJob .Spec .Suspend , false ) &&
306- replicatedJobsSpecChanged (oldJobSet .Spec .ReplicatedJobs , jobSetSpec . ReplicatedJobs , trainerImage ) {
327+ replicatedJobsSpecChanged (oldJobSet .Spec .ReplicatedJobs , jobSet . Spec . ReplicatedJobs ) {
307328 j .logger .Info ("Deleting stale suspended JobSet: spec.replicatedJobs changed post-upgrade, will recreate" ,
308329 "jobSet" , client .ObjectKeyFromObject (trainJob ))
309330 // Use background propagation: the JobSet is suspended so there are no
@@ -318,29 +339,6 @@ func (j *JobSet) Build(ctx context.Context, info *runtime.Info, trainJob *traine
318339 return nil , nil
319340 }
320341
321- // Init the JobSet apply configuration from the runtime template spec
322- jobSetBuilder := NewBuilder (jobsetv1alpha2ac .JobSet (trainJob .Name , trainJob .Namespace ).
323- WithLabels (maps .Clone (info .Labels )).
324- WithAnnotations (maps .Clone (info .Annotations )).
325- WithSpec (jobSetSpec ))
326-
327- // TODO (andreyvelich): Refactor the builder with wrappers for PodSpec.
328- // TODO: Once we remove deprecated runtime.Info.Trainer, we should remove JobSet Builder with DeprecatedTrainer().
329- jobSet := jobSetBuilder .
330- Initializer (trainJob ).
331- Trainer (info , trainJob ).
332- PodLabels (info .Scheduler .PodLabels ).
333- PodAnnotations (info .Scheduler .PodAnnotations ).
334- Suspend (trainJob .Spec .Suspend ).
335- Build ().
336- WithOwnerReferences (metav1ac .OwnerReference ().
337- WithAPIVersion (trainer .GroupVersion .String ()).
338- WithKind (trainer .TrainJobKind ).
339- WithName (trainJob .Name ).
340- WithUID (trainJob .UID ).
341- WithController (true ).
342- WithBlockOwnerDeletion (true ))
343-
344342 return []apiruntime.ApplyConfiguration {jobSet }, nil
345343}
346344
@@ -350,15 +348,12 @@ func (j *JobSet) Build(ctx context.Context, info *runtime.Info, trainJob *traine
350348// scenario where a ClusterTrainingRuntime change updates the runtime container image.
351349// RHOAIENG-48867
352350//
353- // trainerImage is the TrainJob.Spec.Trainer.Image override (may be nil). The
354- // builder applies this override to the node container after this function is
355- // called, so the raw jobSetSpec still holds the runtime template image for that
356- // container. Passing trainerImage here ensures we compare the effective image
357- // that will end up in the JobSet, avoiding false-positive deletions.
351+ // desired must be the fully-built desired spec — all builder mutations (Initializer,
352+ // Trainer, PodLabels, PodAnnotations) must be applied before calling this function
353+ // so that the comparison reflects the effective state that will be applied to the cluster.
358354func replicatedJobsSpecChanged (
359355 existing []jobsetv1alpha2.ReplicatedJob ,
360356 desired []jobsetv1alpha2ac.ReplicatedJobApplyConfiguration ,
361- trainerImage * string ,
362357) bool {
363358 if len (existing ) != len (desired ) {
364359 return true
@@ -380,23 +375,48 @@ func replicatedJobsSpecChanged(
380375 d .Template .Spec .Template == nil || d .Template .Spec .Template .Spec == nil {
381376 continue
382377 }
383- existingImages := make (map [string ]string , len (e .Template .Spec .Template .Spec .Containers ))
384- for _ , c := range e .Template .Spec .Template .Spec .Containers {
378+ dSpec := d .Template .Spec .Template .Spec
379+ eSpec := & e .Template .Spec .Template .Spec
380+
381+ // Check containers: detect changed/added images (desired → existing direction).
382+ existingImages := make (map [string ]string , len (eSpec .Containers ))
383+ for _ , c := range eSpec .Containers {
385384 existingImages [c .Name ] = c .Image
386385 }
387- for _ , c := range d .Template .Spec .Template .Spec .Containers {
386+ desiredContainerNames := sets .New [string ]()
387+ for _ , c := range dSpec .Containers {
388+ if c .Name == nil || c .Image == nil {
389+ continue
390+ }
391+ desiredContainerNames .Insert (* c .Name )
392+ if img , found := existingImages [* c .Name ]; ! found || img != * c .Image {
393+ return true
394+ }
395+ }
396+ // Detect containers removed from desired (existing → desired direction).
397+ for _ , c := range eSpec .Containers {
398+ if ! desiredContainerNames .Has (c .Name ) {
399+ return true
400+ }
401+ }
402+
403+ // Repeat the same checks for initContainers.
404+ existingInitImages := make (map [string ]string , len (eSpec .InitContainers ))
405+ for _ , c := range eSpec .InitContainers {
406+ existingInitImages [c .Name ] = c .Image
407+ }
408+ desiredInitNames := sets .New [string ]()
409+ for _ , c := range dSpec .InitContainers {
388410 if c .Name == nil || c .Image == nil {
389411 continue
390412 }
391- // For the node (trainer) container, use the TrainJob's trainer image
392- // override as the effective desired image. The builder applies this
393- // override after replicatedJobsSpecChanged is called, so jobSetSpec
394- // still has the raw runtime template image at this point.
395- effectiveImage := c .Image
396- if * c .Name == constants .Node && trainerImage != nil {
397- effectiveImage = trainerImage
413+ desiredInitNames .Insert (* c .Name )
414+ if img , found := existingInitImages [* c .Name ]; ! found || img != * c .Image {
415+ return true
398416 }
399- if img , found := existingImages [* c .Name ]; ! found || img != * effectiveImage {
417+ }
418+ for _ , c := range eSpec .InitContainers {
419+ if ! desiredInitNames .Has (c .Name ) {
400420 return true
401421 }
402422 }
0 commit comments