Skip to content

Commit 87e08e0

Browse files
committed
test(scheduler): split reclaim topology tests from benchmarks
Signed-off-by: Erez Freiberger <enoodle@gmail.com>
1 parent 58671d2 commit 87e08e0

2 files changed

Lines changed: 287 additions & 263 deletions

File tree

pkg/scheduler/actions/integration_tests/reclaim/reclaim_benchmark_test.go

Lines changed: 0 additions & 263 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ import (
1616
kaiv1 "github.com/kai-scheduler/KAI-scheduler/pkg/apis/kai/v1"
1717
commonconstants "github.com/kai-scheduler/KAI-scheduler/pkg/common/constants"
1818
"github.com/kai-scheduler/KAI-scheduler/pkg/scheduler/actions/reclaim"
19-
"github.com/kai-scheduler/KAI-scheduler/pkg/scheduler/api"
2019
"github.com/kai-scheduler/KAI-scheduler/pkg/scheduler/api/common_info"
2120
"github.com/kai-scheduler/KAI-scheduler/pkg/scheduler/api/pod_status"
2221
"github.com/kai-scheduler/KAI-scheduler/pkg/scheduler/constants"
@@ -57,97 +56,6 @@ func init() {
5756
test_utils.InitTestingInfrastructure()
5857
}
5958

60-
func TestUnschedulableDistributedReclaimTopology(t *testing.T) {
61-
defer gock.Off()
62-
63-
ctrl := gomock.NewController(t)
64-
defer ctrl.Finish()
65-
66-
params := defaultUnschedulableDistributedReclaimParams(10)
67-
topology := buildUnschedulableDistributedReclaimTopology(params)
68-
69-
ssn := test_utils.BuildSession(topology, ctrl)
70-
onJobSolutionStartCalls := 0
71-
ssn.AddOnJobSolutionStartFn(func() {
72-
onJobSolutionStartCalls++
73-
})
74-
75-
action := reclaim.New()
76-
action.Execute(ssn)
77-
78-
job := ssn.ClusterInfo.PodGroupInfos[common_info.PodGroupID(unschedulableDistributedJobName)]
79-
if job == nil {
80-
t.Fatalf("expected distributed job %q in session", unschedulableDistributedJobName)
81-
}
82-
83-
if onJobSolutionStartCalls == 0 {
84-
t.Fatalf("expected reclaim to attempt solving for the distributed job")
85-
}
86-
87-
if len(job.PodStatusIndex[pod_status.Pending]) != params.PodsPerDistributedJob {
88-
t.Fatalf("expected %d pending distributed-job tasks, got %d",
89-
params.PodsPerDistributedJob, len(job.PodStatusIndex[pod_status.Pending]))
90-
}
91-
92-
for _, clusterJob := range ssn.ClusterInfo.PodGroupInfos {
93-
if len(clusterJob.PodStatusIndex[pod_status.Releasing]) != 0 {
94-
t.Fatalf("expected no committed reclaimees, found %d releasing tasks on job %q",
95-
len(clusterJob.PodStatusIndex[pod_status.Releasing]), clusterJob.Name)
96-
}
97-
if len(clusterJob.PodStatusIndex[pod_status.Pipelined]) != 0 {
98-
t.Fatalf("expected no pipelined tasks after failed reclaim, found %d on job %q",
99-
len(clusterJob.PodStatusIndex[pod_status.Pipelined]), clusterJob.Name)
100-
}
101-
}
102-
}
103-
104-
func TestDefaultGeneratorPortfolioPreservesTopologyReclaimCoverage(t *testing.T) {
105-
defer gock.Off()
106-
107-
ctrl := gomock.NewController(t)
108-
defer ctrl.Finish()
109-
110-
params := defaultUnschedulableDistributedReclaimParams(10)
111-
topology := buildUnschedulableDistributedReclaimTopology(params)
112-
113-
ssn := test_utils.BuildSession(topology, ctrl)
114-
assertDefaultScenarioGeneratorPortfolio(t, ssn)
115-
assertDefaultScenarioSearchBudgets(t, ssn)
116-
multiNodeGangEmissions := observeMultiNodeGangScenarios(t, ssn)
117-
118-
action := reclaim.New()
119-
action.Execute(ssn)
120-
121-
if *multiNodeGangEmissions == 0 {
122-
t.Fatalf("expected default reclaim scenario portfolio to reach %s", commonconstants.GeneratorMultiNodeGang)
123-
}
124-
125-
job := ssn.ClusterInfo.PodGroupInfos[common_info.PodGroupID(unschedulableDistributedJobName)]
126-
if job == nil {
127-
t.Fatalf("expected distributed job %q in session", unschedulableDistributedJobName)
128-
}
129-
if len(job.PodStatusIndex[pod_status.Pending]) != params.PodsPerDistributedJob {
130-
t.Fatalf("expected %d pending distributed-job tasks, got %d",
131-
params.PodsPerDistributedJob, len(job.PodStatusIndex[pod_status.Pending]))
132-
}
133-
}
134-
135-
type unschedulableDistributedReclaimParams struct {
136-
NumNodes int
137-
GPUsPerNode int
138-
PodsPerDistributedJob int
139-
RunningJobsPerNode int
140-
Queue0DeservedGPUs int
141-
Queue1DeservedGPUs int
142-
NumberOfCacheBinds int
143-
NumberOfCacheEvictions int
144-
NumberOfPipelineActions int
145-
}
146-
147-
const (
148-
unschedulableDistributedJobName = "unschedulable-distributed-job"
149-
)
150-
15159
func BenchmarkReclaimLargeJobs_10Node(b *testing.B) {
15260
benchmarkReclaimLargeJobs(b, 10)
15361
}
@@ -388,174 +296,3 @@ func buildReclaimTopology(params VeryLargeJobReclaimParams) test_utils.TestTopol
388296
},
389297
}
390298
}
391-
392-
func defaultUnschedulableDistributedReclaimParams(numNodes int) unschedulableDistributedReclaimParams {
393-
return unschedulableDistributedReclaimParams{
394-
NumNodes: numNodes,
395-
GPUsPerNode: 8,
396-
PodsPerDistributedJob: 10,
397-
RunningJobsPerNode: 8,
398-
Queue0DeservedGPUs: (numNodes * 8) - (10 * 8) + 1,
399-
Queue1DeservedGPUs: 10 * 8,
400-
NumberOfCacheBinds: 0,
401-
NumberOfCacheEvictions: 0,
402-
NumberOfPipelineActions: 0,
403-
}
404-
}
405-
406-
func buildUnschedulableDistributedReclaimTopology(
407-
params unschedulableDistributedReclaimParams,
408-
) test_utils.TestTopologyBasic {
409-
return test_utils.TestTopologyBasic{
410-
Name: "unschedulable distributed reclaim benchmark",
411-
Nodes: buildUnschedulableDistributedReclaimNodes(params),
412-
Jobs: buildUnschedulableDistributedReclaimJobs(params),
413-
Queues: []test_utils.TestQueueBasic{
414-
{
415-
Name: "queue-0",
416-
DeservedGPUs: float64(params.Queue0DeservedGPUs),
417-
GPUOverQuotaWeight: 0,
418-
},
419-
{
420-
Name: "queue-1",
421-
DeservedGPUs: float64(params.Queue1DeservedGPUs),
422-
GPUOverQuotaWeight: 0,
423-
},
424-
},
425-
Mocks: &test_utils.TestMock{
426-
CacheRequirements: &test_utils.CacheMocking{
427-
NumberOfCacheBinds: params.NumberOfCacheBinds,
428-
NumberOfCacheEvictions: params.NumberOfCacheEvictions,
429-
NumberOfPipelineActions: params.NumberOfPipelineActions,
430-
},
431-
},
432-
}
433-
}
434-
435-
func buildUnschedulableDistributedReclaimNodes(
436-
params unschedulableDistributedReclaimParams,
437-
) map[string]nodes_fake.TestNodeBasic {
438-
nodes := make(map[string]nodes_fake.TestNodeBasic, params.NumNodes)
439-
for i := 0; i < params.NumNodes; i++ {
440-
nodes[fmt.Sprintf("node%d", i)] = nodes_fake.TestNodeBasic{
441-
GPUs: params.GPUsPerNode,
442-
}
443-
}
444-
return nodes
445-
}
446-
447-
func buildUnschedulableDistributedReclaimJobs(
448-
params unschedulableDistributedReclaimParams,
449-
) []*jobs_fake.TestJobBasic {
450-
runningJobCount := params.NumNodes * params.RunningJobsPerNode
451-
jobs := make([]*jobs_fake.TestJobBasic, 0, runningJobCount+1)
452-
for i := 0; i < runningJobCount; i++ {
453-
jobs = append(jobs, &jobs_fake.TestJobBasic{
454-
Name: fmt.Sprintf("running-job-%d", i),
455-
RequiredGPUsPerTask: 1,
456-
Priority: constants.PriorityTrainNumber,
457-
QueueName: "queue-0",
458-
Tasks: []*tasks_fake.TestTaskBasic{
459-
{
460-
NodeName: fmt.Sprintf("node%d", i%params.NumNodes),
461-
State: pod_status.Running,
462-
},
463-
},
464-
})
465-
}
466-
467-
distributedJob := &jobs_fake.TestJobBasic{
468-
Name: unschedulableDistributedJobName,
469-
RequiredGPUsPerTask: float64(params.GPUsPerNode),
470-
Priority: constants.PriorityTrainNumber,
471-
QueueName: "queue-1",
472-
Tasks: make([]*tasks_fake.TestTaskBasic, params.PodsPerDistributedJob),
473-
}
474-
for i := 0; i < params.PodsPerDistributedJob; i++ {
475-
distributedJob.Tasks[i] = &tasks_fake.TestTaskBasic{
476-
State: pod_status.Pending,
477-
}
478-
}
479-
480-
jobs = append(jobs, distributedJob)
481-
return jobs
482-
}
483-
484-
func assertDefaultScenarioGeneratorPortfolio(t *testing.T, ssn *framework.Session) {
485-
t.Helper()
486-
487-
for _, expectedGenerator := range []string{
488-
commonconstants.GeneratorNodeLocalGreedy,
489-
commonconstants.GeneratorMultiNodeGang,
490-
} {
491-
foundGenerator := false
492-
for _, registration := range ssn.ScenarioGeneratorRegistrations {
493-
if registration.Name != expectedGenerator {
494-
continue
495-
}
496-
foundGenerator = true
497-
break
498-
}
499-
if !foundGenerator {
500-
t.Fatalf("expected default scenario generator plugins to register %q", expectedGenerator)
501-
}
502-
}
503-
}
504-
505-
func observeMultiNodeGangScenarios(t *testing.T, ssn *framework.Session) *int {
506-
t.Helper()
507-
508-
for index, registration := range ssn.ScenarioGeneratorRegistrations {
509-
if registration.Name != commonconstants.GeneratorMultiNodeGang {
510-
continue
511-
}
512-
emissions := 0
513-
originalFactory := registration.Factory
514-
ssn.ScenarioGeneratorRegistrations[index].Factory = func(ctx framework.ScenarioGeneratorContext) framework.ScenarioGenerator {
515-
generator := originalFactory(ctx)
516-
if generator == nil {
517-
return nil
518-
}
519-
return &observedScenarioGenerator{
520-
ScenarioGenerator: generator,
521-
onScenario: func() {
522-
emissions++
523-
},
524-
}
525-
}
526-
return &emissions
527-
}
528-
t.Fatalf("expected default scenario generator plugins to register %q", commonconstants.GeneratorMultiNodeGang)
529-
return nil
530-
}
531-
532-
func assertDefaultScenarioSearchBudgets(t *testing.T, ssn *framework.Session) {
533-
t.Helper()
534-
535-
if ssn.Config == nil || ssn.Config.ScenarioSearchBudgets == nil {
536-
t.Fatalf("expected default scenario search budgets on session")
537-
}
538-
539-
generatorBudgets := ssn.Config.ScenarioSearchBudgets.MaxGeneratorSearchDuration
540-
if got := generatorBudgets[commonconstants.GeneratorNodeLocalGreedy].Duration; got != 30*time.Second {
541-
t.Fatalf("expected default %s budget 30s, got %s",
542-
commonconstants.GeneratorNodeLocalGreedy, got)
543-
}
544-
if got := generatorBudgets[commonconstants.GeneratorMultiNodeGang].Duration; got != 2*time.Minute {
545-
t.Fatalf("expected default %s budget 2m, got %s",
546-
commonconstants.GeneratorMultiNodeGang, got)
547-
}
548-
}
549-
550-
type observedScenarioGenerator struct {
551-
framework.ScenarioGenerator
552-
onScenario func()
553-
}
554-
555-
func (g *observedScenarioGenerator) Next() api.ScenarioInfo {
556-
scenario := g.ScenarioGenerator.Next()
557-
if scenario != nil && g.onScenario != nil {
558-
g.onScenario()
559-
}
560-
return scenario
561-
}

0 commit comments

Comments
 (0)