Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions pkg/runtime/framework/plugins/jobset/jobset.go
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,29 @@ func (j *JobSet) Build(ctx context.Context, info *runtime.Info, trainJob *traine
WithController(true).
WithBlockOwnerDeletion(true))

// Container images in spec.replicatedJobs are immutable in JobSet.
// When resuming a suspended JobSet, omit images from the apply config so SSA
// does not attempt to update them. The job resumes with the image it was created with.
if oldJobSet != nil &&
ptr.Deref(oldJobSet.Spec.Suspend, false) &&
!ptr.Deref(trainJob.Spec.Suspend, false) {
for i := range jobSet.Spec.ReplicatedJobs {
if jobSet.Spec.ReplicatedJobs[i].Template == nil ||
jobSet.Spec.ReplicatedJobs[i].Template.Spec == nil ||
jobSet.Spec.ReplicatedJobs[i].Template.Spec.Template == nil ||
jobSet.Spec.ReplicatedJobs[i].Template.Spec.Template.Spec == nil {
continue
}
podSpec := jobSet.Spec.ReplicatedJobs[i].Template.Spec.Template.Spec
for j := range podSpec.Containers {
podSpec.Containers[j].Image = nil
}
for j := range podSpec.InitContainers {
podSpec.InitContainers[j].Image = nil
Comment on lines +339 to +343
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The inner loops use for j := range ... which shadows the method receiver j and makes this block harder to read/grep; rename the loop indices (e.g., cIdx, icIdx) to avoid shadowing.

Suggested change
for j := range podSpec.Containers {
podSpec.Containers[j].Image = nil
}
for j := range podSpec.InitContainers {
podSpec.InitContainers[j].Image = nil
for cIdx := range podSpec.Containers {
podSpec.Containers[cIdx].Image = nil
}
for icIdx := range podSpec.InitContainers {
podSpec.InitContainers[icIdx].Image = nil

Copilot uses AI. Check for mistakes.
}
}
}

return []apiruntime.ApplyConfiguration{jobSet}, nil
}

Expand Down
150 changes: 150 additions & 0 deletions pkg/runtime/framework/plugins/jobset/jobset_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1586,3 +1586,153 @@ func TestValidate(t *testing.T) {
})
}
}

// TestBuild_ResumeOmitsContainerImages verifies that when a suspended JobSet is
// resumed, container images are omitted from the SSA apply config so that the
// immutable spec.replicatedJobs field is not updated. The job resumes with the
// image it was originally created with.
func TestBuild_ResumeOmitsContainerImages(t *testing.T) {
const upgradedImage = "registry.example.com/trainer:v2"

cases := map[string]struct {
existingJobSet *jobsetv1alpha2.JobSet
trainJobSuspend bool
wantNilImages bool
wantNilResult bool
}{
"suspended JobSet being resumed: images must be nil in apply config": {
existingJobSet: &jobsetv1alpha2.JobSet{
ObjectMeta: metav1.ObjectMeta{Name: "trainjob", Namespace: metav1.NamespaceDefault},
Spec: jobsetv1alpha2.JobSetSpec{Suspend: ptr.To(true)},
},
trainJobSuspend: false,
wantNilImages: true,
},
"no existing JobSet: images present in apply config (creation path)": {
existingJobSet: nil,
trainJobSuspend: false,
wantNilImages: false,
},
"both running: existing guard fires, nil result": {
existingJobSet: &jobsetv1alpha2.JobSet{
ObjectMeta: metav1.ObjectMeta{Name: "trainjob", Namespace: metav1.NamespaceDefault},
Spec: jobsetv1alpha2.JobSetSpec{Suspend: ptr.To(false)},
},
trainJobSuspend: false,
wantNilResult: true,
},
"TrainJob suspended: images present (not a resume)": {
existingJobSet: &jobsetv1alpha2.JobSet{
ObjectMeta: metav1.ObjectMeta{Name: "trainjob", Namespace: metav1.NamespaceDefault},
Spec: jobsetv1alpha2.JobSetSpec{Suspend: ptr.To(true)},
},
trainJobSuspend: true,
wantNilImages: false,
},
}

for name, tc := range cases {
t.Run(name, func(t *testing.T) {
_, ctx := ktesting.NewTestContext(t)
clientBuilder := utiltesting.NewClientBuilder()
if tc.existingJobSet != nil {
clientBuilder = clientBuilder.WithObjects(tc.existingJobSet)
}
cli := clientBuilder.Build()

p, err := New(ctx, cli, nil, nil)
if err != nil {
t.Fatalf("Failed to initialize JobSet plugin: %v", err)
}

trainJob := utiltesting.MakeTrainJobWrapper(metav1.NamespaceDefault, "trainjob").
Suspend(tc.trainJobSuspend).
RuntimeRef(trainer.GroupVersion.WithKind(trainer.ClusterTrainingRuntimeKind), "runtime").
Trainer(utiltesting.MakeTrainJobTrainerWrapper().
Container(upgradedImage, nil, nil, corev1.ResourceList{}).
Obj()).
Obj()

info := &runtime.Info{
Labels: map[string]string{},
Annotations: map[string]string{},
Scheduler: &runtime.Scheduler{},
TemplateSpec: runtime.TemplateSpec{
PodSets: []runtime.PodSet{
{
Name: constants.Node,
Containers: []runtime.Container{{Name: constants.Node, Image: upgradedImage}},
},
},
ObjApply: jobsetv1alpha2ac.JobSetSpec().
WithReplicatedJobs(
jobsetv1alpha2ac.ReplicatedJob().
WithName(constants.Node).
WithTemplate(batchv1ac.JobTemplateSpec().
WithSpec(batchv1ac.JobSpec().
WithTemplate(corev1ac.PodTemplateSpec().
WithSpec(corev1ac.PodSpec().
WithContainers(
corev1ac.Container().
WithName(constants.Node).
WithImage(upgradedImage),
),
),
Comment on lines +1673 to +1680
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test intends to verify both containers and initContainers have images omitted on resume, but the apply template only defines WithContainers(...) and never sets any InitContainers, so the initContainer assertions never exercise the new behavior; add an initContainer with an image to the template (and possibly the PodSets) so the resume/non-resume cases actually validate initContainer image handling.

Copilot generated this review using guidance from repository custom instructions.
),
),
),
),
},
}

objs, err := p.(framework.ComponentBuilderPlugin).Build(ctx, info, trainJob)
if err != nil {
t.Fatalf("Unexpected error from Build: %v", err)
}

if tc.wantNilResult {
if objs != nil {
t.Errorf("Expected nil result (running guard), got %v", objs)
}
return
}
if objs == nil || len(objs) == 0 {
t.Fatalf("Expected non-nil apply config, got nil")
}

jobSetApply, ok := objs[0].(*jobsetv1alpha2ac.JobSetApplyConfiguration)
if !ok {
t.Fatalf("Expected *JobSetApplyConfiguration, got %T", objs[0])
}

for _, rJob := range jobSetApply.Spec.ReplicatedJobs {
if rJob.Template == nil || rJob.Template.Spec == nil ||
rJob.Template.Spec.Template == nil || rJob.Template.Spec.Template.Spec == nil {
continue
}
for _, c := range rJob.Template.Spec.Template.Spec.Containers {
if tc.wantNilImages {
if c.Image != nil {
t.Errorf("Expected nil image for container %s on resume, got %q", *c.Name, *c.Image)
}
} else {
if c.Image == nil {
t.Errorf("Expected non-nil image for container %s, got nil", *c.Name)
}
}
}
for _, c := range rJob.Template.Spec.Template.Spec.InitContainers {
if tc.wantNilImages {
if c.Image != nil {
t.Errorf("Expected nil image for initContainer %s on resume, got %q", *c.Name, *c.Image)
}
} else {
if c.Image == nil {
t.Errorf("Expected non-nil image for initContainer %s, got nil", *c.Name)
}
}
}
}
})
}
}
Loading