@@ -16,7 +16,6 @@ import (
1616 kaiv1 "github.com/kai-scheduler/KAI-scheduler/pkg/apis/kai/v1"
1717 commonconstants "github.com/kai-scheduler/KAI-scheduler/pkg/common/constants"
1818 "github.com/kai-scheduler/KAI-scheduler/pkg/scheduler/actions/reclaim"
19- "github.com/kai-scheduler/KAI-scheduler/pkg/scheduler/api"
2019 "github.com/kai-scheduler/KAI-scheduler/pkg/scheduler/api/common_info"
2120 "github.com/kai-scheduler/KAI-scheduler/pkg/scheduler/api/pod_status"
2221 "github.com/kai-scheduler/KAI-scheduler/pkg/scheduler/constants"
@@ -57,97 +56,6 @@ func init() {
5756 test_utils .InitTestingInfrastructure ()
5857}
5958
60- func TestUnschedulableDistributedReclaimTopology (t * testing.T ) {
61- defer gock .Off ()
62-
63- ctrl := gomock .NewController (t )
64- defer ctrl .Finish ()
65-
66- params := defaultUnschedulableDistributedReclaimParams (10 )
67- topology := buildUnschedulableDistributedReclaimTopology (params )
68-
69- ssn := test_utils .BuildSession (topology , ctrl )
70- onJobSolutionStartCalls := 0
71- ssn .AddOnJobSolutionStartFn (func () {
72- onJobSolutionStartCalls ++
73- })
74-
75- action := reclaim .New ()
76- action .Execute (ssn )
77-
78- job := ssn .ClusterInfo .PodGroupInfos [common_info .PodGroupID (unschedulableDistributedJobName )]
79- if job == nil {
80- t .Fatalf ("expected distributed job %q in session" , unschedulableDistributedJobName )
81- }
82-
83- if onJobSolutionStartCalls == 0 {
84- t .Fatalf ("expected reclaim to attempt solving for the distributed job" )
85- }
86-
87- if len (job .PodStatusIndex [pod_status .Pending ]) != params .PodsPerDistributedJob {
88- t .Fatalf ("expected %d pending distributed-job tasks, got %d" ,
89- params .PodsPerDistributedJob , len (job .PodStatusIndex [pod_status .Pending ]))
90- }
91-
92- for _ , clusterJob := range ssn .ClusterInfo .PodGroupInfos {
93- if len (clusterJob .PodStatusIndex [pod_status .Releasing ]) != 0 {
94- t .Fatalf ("expected no committed reclaimees, found %d releasing tasks on job %q" ,
95- len (clusterJob .PodStatusIndex [pod_status .Releasing ]), clusterJob .Name )
96- }
97- if len (clusterJob .PodStatusIndex [pod_status .Pipelined ]) != 0 {
98- t .Fatalf ("expected no pipelined tasks after failed reclaim, found %d on job %q" ,
99- len (clusterJob .PodStatusIndex [pod_status .Pipelined ]), clusterJob .Name )
100- }
101- }
102- }
103-
104- func TestDefaultGeneratorPortfolioPreservesTopologyReclaimCoverage (t * testing.T ) {
105- defer gock .Off ()
106-
107- ctrl := gomock .NewController (t )
108- defer ctrl .Finish ()
109-
110- params := defaultUnschedulableDistributedReclaimParams (10 )
111- topology := buildUnschedulableDistributedReclaimTopology (params )
112-
113- ssn := test_utils .BuildSession (topology , ctrl )
114- assertDefaultScenarioGeneratorPortfolio (t , ssn )
115- assertDefaultScenarioSearchBudgets (t , ssn )
116- multiNodeGangEmissions := observeMultiNodeGangScenarios (t , ssn )
117-
118- action := reclaim .New ()
119- action .Execute (ssn )
120-
121- if * multiNodeGangEmissions == 0 {
122- t .Fatalf ("expected default reclaim scenario portfolio to reach %s" , commonconstants .GeneratorMultiNodeGang )
123- }
124-
125- job := ssn .ClusterInfo .PodGroupInfos [common_info .PodGroupID (unschedulableDistributedJobName )]
126- if job == nil {
127- t .Fatalf ("expected distributed job %q in session" , unschedulableDistributedJobName )
128- }
129- if len (job .PodStatusIndex [pod_status .Pending ]) != params .PodsPerDistributedJob {
130- t .Fatalf ("expected %d pending distributed-job tasks, got %d" ,
131- params .PodsPerDistributedJob , len (job .PodStatusIndex [pod_status .Pending ]))
132- }
133- }
134-
135- type unschedulableDistributedReclaimParams struct {
136- NumNodes int
137- GPUsPerNode int
138- PodsPerDistributedJob int
139- RunningJobsPerNode int
140- Queue0DeservedGPUs int
141- Queue1DeservedGPUs int
142- NumberOfCacheBinds int
143- NumberOfCacheEvictions int
144- NumberOfPipelineActions int
145- }
146-
147- const (
148- unschedulableDistributedJobName = "unschedulable-distributed-job"
149- )
150-
15159func BenchmarkReclaimLargeJobs_10Node (b * testing.B ) {
15260 benchmarkReclaimLargeJobs (b , 10 )
15361}
@@ -388,174 +296,3 @@ func buildReclaimTopology(params VeryLargeJobReclaimParams) test_utils.TestTopol
388296 },
389297 }
390298}
391-
392- func defaultUnschedulableDistributedReclaimParams (numNodes int ) unschedulableDistributedReclaimParams {
393- return unschedulableDistributedReclaimParams {
394- NumNodes : numNodes ,
395- GPUsPerNode : 8 ,
396- PodsPerDistributedJob : 10 ,
397- RunningJobsPerNode : 8 ,
398- Queue0DeservedGPUs : (numNodes * 8 ) - (10 * 8 ) + 1 ,
399- Queue1DeservedGPUs : 10 * 8 ,
400- NumberOfCacheBinds : 0 ,
401- NumberOfCacheEvictions : 0 ,
402- NumberOfPipelineActions : 0 ,
403- }
404- }
405-
406- func buildUnschedulableDistributedReclaimTopology (
407- params unschedulableDistributedReclaimParams ,
408- ) test_utils.TestTopologyBasic {
409- return test_utils.TestTopologyBasic {
410- Name : "unschedulable distributed reclaim benchmark" ,
411- Nodes : buildUnschedulableDistributedReclaimNodes (params ),
412- Jobs : buildUnschedulableDistributedReclaimJobs (params ),
413- Queues : []test_utils.TestQueueBasic {
414- {
415- Name : "queue-0" ,
416- DeservedGPUs : float64 (params .Queue0DeservedGPUs ),
417- GPUOverQuotaWeight : 0 ,
418- },
419- {
420- Name : "queue-1" ,
421- DeservedGPUs : float64 (params .Queue1DeservedGPUs ),
422- GPUOverQuotaWeight : 0 ,
423- },
424- },
425- Mocks : & test_utils.TestMock {
426- CacheRequirements : & test_utils.CacheMocking {
427- NumberOfCacheBinds : params .NumberOfCacheBinds ,
428- NumberOfCacheEvictions : params .NumberOfCacheEvictions ,
429- NumberOfPipelineActions : params .NumberOfPipelineActions ,
430- },
431- },
432- }
433- }
434-
435- func buildUnschedulableDistributedReclaimNodes (
436- params unschedulableDistributedReclaimParams ,
437- ) map [string ]nodes_fake.TestNodeBasic {
438- nodes := make (map [string ]nodes_fake.TestNodeBasic , params .NumNodes )
439- for i := 0 ; i < params .NumNodes ; i ++ {
440- nodes [fmt .Sprintf ("node%d" , i )] = nodes_fake.TestNodeBasic {
441- GPUs : params .GPUsPerNode ,
442- }
443- }
444- return nodes
445- }
446-
447- func buildUnschedulableDistributedReclaimJobs (
448- params unschedulableDistributedReclaimParams ,
449- ) []* jobs_fake.TestJobBasic {
450- runningJobCount := params .NumNodes * params .RunningJobsPerNode
451- jobs := make ([]* jobs_fake.TestJobBasic , 0 , runningJobCount + 1 )
452- for i := 0 ; i < runningJobCount ; i ++ {
453- jobs = append (jobs , & jobs_fake.TestJobBasic {
454- Name : fmt .Sprintf ("running-job-%d" , i ),
455- RequiredGPUsPerTask : 1 ,
456- Priority : constants .PriorityTrainNumber ,
457- QueueName : "queue-0" ,
458- Tasks : []* tasks_fake.TestTaskBasic {
459- {
460- NodeName : fmt .Sprintf ("node%d" , i % params .NumNodes ),
461- State : pod_status .Running ,
462- },
463- },
464- })
465- }
466-
467- distributedJob := & jobs_fake.TestJobBasic {
468- Name : unschedulableDistributedJobName ,
469- RequiredGPUsPerTask : float64 (params .GPUsPerNode ),
470- Priority : constants .PriorityTrainNumber ,
471- QueueName : "queue-1" ,
472- Tasks : make ([]* tasks_fake.TestTaskBasic , params .PodsPerDistributedJob ),
473- }
474- for i := 0 ; i < params .PodsPerDistributedJob ; i ++ {
475- distributedJob .Tasks [i ] = & tasks_fake.TestTaskBasic {
476- State : pod_status .Pending ,
477- }
478- }
479-
480- jobs = append (jobs , distributedJob )
481- return jobs
482- }
483-
484- func assertDefaultScenarioGeneratorPortfolio (t * testing.T , ssn * framework.Session ) {
485- t .Helper ()
486-
487- for _ , expectedGenerator := range []string {
488- commonconstants .GeneratorNodeLocalGreedy ,
489- commonconstants .GeneratorMultiNodeGang ,
490- } {
491- foundGenerator := false
492- for _ , registration := range ssn .ScenarioGeneratorRegistrations {
493- if registration .Name != expectedGenerator {
494- continue
495- }
496- foundGenerator = true
497- break
498- }
499- if ! foundGenerator {
500- t .Fatalf ("expected default scenario generator plugins to register %q" , expectedGenerator )
501- }
502- }
503- }
504-
505- func observeMultiNodeGangScenarios (t * testing.T , ssn * framework.Session ) * int {
506- t .Helper ()
507-
508- for index , registration := range ssn .ScenarioGeneratorRegistrations {
509- if registration .Name != commonconstants .GeneratorMultiNodeGang {
510- continue
511- }
512- emissions := 0
513- originalFactory := registration .Factory
514- ssn .ScenarioGeneratorRegistrations [index ].Factory = func (ctx framework.ScenarioGeneratorContext ) framework.ScenarioGenerator {
515- generator := originalFactory (ctx )
516- if generator == nil {
517- return nil
518- }
519- return & observedScenarioGenerator {
520- ScenarioGenerator : generator ,
521- onScenario : func () {
522- emissions ++
523- },
524- }
525- }
526- return & emissions
527- }
528- t .Fatalf ("expected default scenario generator plugins to register %q" , commonconstants .GeneratorMultiNodeGang )
529- return nil
530- }
531-
532- func assertDefaultScenarioSearchBudgets (t * testing.T , ssn * framework.Session ) {
533- t .Helper ()
534-
535- if ssn .Config == nil || ssn .Config .ScenarioSearchBudgets == nil {
536- t .Fatalf ("expected default scenario search budgets on session" )
537- }
538-
539- generatorBudgets := ssn .Config .ScenarioSearchBudgets .MaxGeneratorSearchDuration
540- if got := generatorBudgets [commonconstants .GeneratorNodeLocalGreedy ].Duration ; got != 30 * time .Second {
541- t .Fatalf ("expected default %s budget 30s, got %s" ,
542- commonconstants .GeneratorNodeLocalGreedy , got )
543- }
544- if got := generatorBudgets [commonconstants .GeneratorMultiNodeGang ].Duration ; got != 2 * time .Minute {
545- t .Fatalf ("expected default %s budget 2m, got %s" ,
546- commonconstants .GeneratorMultiNodeGang , got )
547- }
548- }
549-
550- type observedScenarioGenerator struct {
551- framework.ScenarioGenerator
552- onScenario func ()
553- }
554-
555- func (g * observedScenarioGenerator ) Next () api.ScenarioInfo {
556- scenario := g .ScenarioGenerator .Next ()
557- if scenario != nil && g .onScenario != nil {
558- g .onScenario ()
559- }
560- return scenario
561- }
0 commit comments