Skip to content

Commit bdbcd30

Browse files
authored
[TAS] Fix empty hot swap replacement for slices (#6914)
* [TAS] Fix empty hot swap replacement for slices
* rm fg
1 parent b3c4426 commit bdbcd30

File tree

2 files changed

+49
-0
lines changed

2 files changed

+49
-0
lines changed

pkg/cache/scheduler/tas_flavor_snapshot.go

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -518,6 +518,9 @@ func (s *TASFlavorSnapshot) findReplacementAssignment(tr *TASPodSetRequests, exi
518518
if reason != "" {
519519
return nil, nil, reason
520520
}
521+
if replacementAssignment == nil || len(replacementAssignment[tr.PodSet.Name].Domains) == 0 {
522+
return nil, nil, fmt.Sprintf("cannot find replacement assignment for unhealthy node: %v", wl.Status.UnhealthyNodes[0].Name)
523+
}
521524
newAssignment := s.mergeTopologyAssignments(replacementAssignment[tr.PodSet.Name], existingAssignment)
522525
return newAssignment, replacementAssignment[tr.PodSet.Name], ""
523526
}

pkg/scheduler/scheduler_test.go

Lines changed: 46 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7023,6 +7023,52 @@ func TestScheduleForTAS(t *testing.T) {
70237023
utiltesting.MakeEventRecord("default", "foo", "Admitted", corev1.EventTypeNormal).Obj(),
70247024
},
70257025
},
7026+
"workload with unhealthyNode annotation; second pass; preferred; no fit when using slices; FailFast": {
7027+
nodes: defaultNodes,
7028+
admissionChecks: []kueue.AdmissionCheck{defaultProvCheck},
7029+
topologies: []kueue.Topology{defaultThreeLevelTopology},
7030+
resourceFlavors: []kueue.ResourceFlavor{defaultTASThreeLevelFlavor},
7031+
clusterQueues: []kueue.ClusterQueue{defaultClusterQueue},
7032+
workloads: []kueue.Workload{
7033+
*utiltesting.MakeWorkload("foo", "default").
7034+
UnhealthyNodes("x0").
7035+
Queue("tas-main").
7036+
PodSets(*utiltesting.MakePodSet("one", 2).
7037+
PreferredTopologyRequest(tasBlockLabel).
7038+
SliceSizeTopologyRequest(2).
7039+
SliceRequiredTopologyRequest(tasRackLabel).
7040+
Request(corev1.ResourceCPU, "1").
7041+
Obj()).
7042+
ReserveQuota(
7043+
utiltesting.MakeAdmission("tas-main").
7044+
PodSets(utiltesting.MakePodSetAssignment("one").Count(2).
7045+
Assignment(corev1.ResourceCPU, "tas-default", "2000m").
7046+
TopologyAssignment(utiltesting.MakeTopologyAssignment(utiltas.Levels(&defaultSingleLevelTopology)).
7047+
Domain(utiltesting.MakeTopologyDomainAssignment([]string{"x0"}, 1).Obj()).
7048+
Domain(utiltesting.MakeTopologyDomainAssignment([]string{"x1"}, 1).Obj()).
7049+
Obj()).
7050+
Obj()).
7051+
Obj(),
7052+
).
7053+
Admitted(true).
7054+
Obj(),
7055+
},
7056+
wantNewAssignments: map[workload.Reference]kueue.Admission{
7057+
"default/foo": *utiltesting.MakeAdmission("tas-main").
7058+
PodSets(utiltesting.MakePodSetAssignment("one").Count(2).
7059+
Assignment(corev1.ResourceCPU, "tas-default", "2000m").
7060+
TopologyAssignment(utiltesting.MakeTopologyAssignment(utiltas.Levels(&defaultSingleLevelTopology)).
7061+
Domain(utiltesting.MakeTopologyDomainAssignment([]string{"x0"}, 1).Obj()).
7062+
Domain(utiltesting.MakeTopologyDomainAssignment([]string{"x1"}, 1).Obj()).
7063+
Obj()).
7064+
Obj()).
7065+
Obj(),
7066+
},
7067+
wantEvents: []utiltesting.EventRecord{
7068+
utiltesting.MakeEventRecord("default", "foo", "EvictedDueToNodeFailures", corev1.EventTypeNormal).
7069+
Message("Workload was evicted as there was no replacement for a failed node: x0").Obj(),
7070+
},
7071+
},
70267072
}
70277073
for name, tc := range cases {
70287074
t.Run(name, func(t *testing.T) {

0 commit comments

Comments
 (0)