@@ -2065,6 +2065,102 @@ test-job-node-0-1.test-job slots=8
20652065 Obj (),
20662066 },
20672067 },
// Test case: a non-trainer replicatedJob (DatasetInitializer) is configured with 3 replicas in the
// TrainingRuntime, while both the runtime ML policy and the TrainJob trainer request 10 nodes.
// The expected JobSet keeps Replicas(3) for DatasetInitializer and Replicas(1) for the other
// replicatedJobs, and the expected PodGroup MinMember counts every DatasetInitializer replica.
2068+ "succeeded to build PodGroup and JobSet with multiple replicas for non-trainer replicatedJob." : {
2069+ trainingRuntime : testingutil .MakeTrainingRuntimeWrapper (metav1 .NamespaceDefault , "test-runtime" ).
2070+ RuntimeSpec (
2071+ testingutil .MakeTrainingRuntimeSpecWrapper (testingutil .MakeTrainingRuntimeWrapper (metav1 .NamespaceDefault , "test-runtime" ).Spec ).
2072+ WithMLPolicy (
2073+ testingutil .MakeMLPolicyWrapper ().
2074+ WithNumNodes (10 ).
2075+ Obj (),
2076+ ).
2077+ PodGroupPolicyCoschedulingSchedulingTimeout (120 ).
// Only the DatasetInitializer replica count is set explicitly on the runtime in this case.
2078+ Replicas (3 , constants .DatasetInitializer ).
2079+ Container (constants .DatasetInitializer , constants .DatasetInitializer , "test:runtime" , []string {"runtime" }, []string {"runtime" }, resRequests ).
2080+ Container (constants .ModelInitializer , constants .ModelInitializer , "test:runtime" , []string {"runtime" }, []string {"runtime" }, resRequests ).
2081+ Container (constants .Node , constants .Node , "test:runtime" , []string {"runtime" }, []string {"runtime" }, resRequests ).
2082+ Obj (),
2083+ ).Obj (),
2084+ trainJob : testingutil .MakeTrainJobWrapper (metav1 .NamespaceDefault , "test-job" ).
2085+ UID ("uid" ).
2086+ RuntimeRef (trainer .SchemeGroupVersion .WithKind (trainer .TrainingRuntimeKind ), "test-runtime" ).
2087+ Trainer (
2088+ testingutil .MakeTrainJobTrainerWrapper ().
2089+ NumNodes (10 ).
2090+ Obj (),
2091+ ).
2092+ Obj (),
// Expected output: one JobSet and one scheduler-plugins PodGroup, both named "test-job" and
// controller-owned by the TrainJob with UID "uid".
2093+ wantObjs : []runtime.Object {
2094+ testingutil .MakeJobSetWrapper (metav1 .NamespaceDefault , "test-job" ).
2095+ ControllerReference (trainer .SchemeGroupVersion .WithKind (trainer .TrainJobKind ), "test-job" , "uid" ).
2096+ Replicas (3 , constants .DatasetInitializer ).
2097+ Replicas (1 , constants .ModelInitializer , constants .Node , constants .Launcher ).
2098+ Parallelism (1 , constants .DatasetInitializer , constants .ModelInitializer , constants .Launcher ).
2099+ Completions (1 , constants .DatasetInitializer , constants .ModelInitializer , constants .Launcher ).
2100+ NumNodes (10 ).
2101+ Container (constants .DatasetInitializer , constants .DatasetInitializer , "test:runtime" , []string {"runtime" }, []string {"runtime" }, resRequests ).
2102+ Container (constants .ModelInitializer , constants .ModelInitializer , "test:runtime" , []string {"runtime" }, []string {"runtime" }, resRequests ).
2103+ Container (constants .Node , constants .Node , "test:runtime" , []string {"runtime" }, []string {"runtime" }, resRequests ).
2104+ PodLabel (schedulerpluginsv1alpha1 .PodGroupLabel , "test-job" ).
2105+ Obj (),
2106+ testingutil .MakeSchedulerPluginsPodGroup (metav1 .NamespaceDefault , "test-job" ).
2107+ ControllerReference (trainer .SchemeGroupVersion .WithKind (trainer .TrainJobKind ), "test-job" , "uid" ).
2108+ MinMember (14 ). // 14 = 10 (trainer nodes) + 3 (DatasetInitializer replicas * 1 pod each) + 1 (ModelInitializer)
// NOTE(review): the CPU total equals MinMember, which presumably means resRequests asks for
// 1 CPU per pod — confirm against the resRequests definition (outside this view).
2109+ MinResources (corev1.ResourceList {
2110+ corev1 .ResourceCPU : resource .MustParse ("14" ),
2111+ }).
// Matches the 120 passed to PodGroupPolicyCoschedulingSchedulingTimeout on the runtime above.
2112+ SchedulingTimeout (120 ).
2113+ Obj (),
2114+ },
2115+ },
// Test case: the TrainingRuntime sets Replicas(4) on the trainer (Node) replicatedJob and
// NumNodes 100 in its ML policy, while the TrainJob trainer requests NumNodes 5.
// The expected objects use the TrainJob's value of 5, and the expected PodGroup counts 5 trainer
// pods rather than 5 multiplied by the 4 replicas.
2116+ "succeeded to build PodGroup and JobSet with trainer replicatedJob Replicas ignored when NumNodes is set." : {
2117+ trainingRuntime : testingutil .MakeTrainingRuntimeWrapper (metav1 .NamespaceDefault , "test-runtime" ).
2118+ RuntimeSpec (
2119+ testingutil .MakeTrainingRuntimeSpecWrapper (testingutil .MakeTrainingRuntimeWrapper (metav1 .NamespaceDefault , "test-runtime" ).Spec ).
2120+ WithMLPolicy (
2121+ testingutil .MakeMLPolicyWrapper ().
2122+ WithNumNodes (100 ).
2123+ Obj (),
2124+ ).
2125+ PodGroupPolicyCoschedulingSchedulingTimeout (120 ).
// Replica count on the trainer replicatedJob; per the test name it must not scale the pod count.
2126+ Replicas (4 , constants .Node ).
2127+ Container (constants .DatasetInitializer , constants .DatasetInitializer , "test:runtime" , []string {"runtime" }, []string {"runtime" }, resRequests ).
2128+ Container (constants .ModelInitializer , constants .ModelInitializer , "test:runtime" , []string {"runtime" }, []string {"runtime" }, resRequests ).
2129+ Container (constants .Node , constants .Node , "test:runtime" , []string {"runtime" }, []string {"runtime" }, resRequests ).
2130+ Obj (),
2131+ ).Obj (),
2132+ trainJob : testingutil .MakeTrainJobWrapper (metav1 .NamespaceDefault , "test-job" ).
2133+ UID ("uid" ).
2134+ RuntimeRef (trainer .SchemeGroupVersion .WithKind (trainer .TrainingRuntimeKind ), "test-runtime" ).
2135+ Trainer (
2136+ testingutil .MakeTrainJobTrainerWrapper ().
2137+ NumNodes (5 ).
2138+ Obj (),
2139+ ).
2140+ Obj (),
// Expected output: one JobSet and one scheduler-plugins PodGroup, both named "test-job" and
// controller-owned by the TrainJob with UID "uid".
2141+ wantObjs : []runtime.Object {
2142+ testingutil .MakeJobSetWrapper (metav1 .NamespaceDefault , "test-job" ).
2143+ ControllerReference (trainer .SchemeGroupVersion .WithKind (trainer .TrainJobKind ), "test-job" , "uid" ).
// NOTE(review): the expected JobSet still carries Replicas(4) on Node even though the PodGroup
// below counts only 5 trainer pods — confirm this matches the intended JobSet output.
2144+ Replicas (4 , constants .Node ).
2145+ Replicas (1 , constants .DatasetInitializer , constants .ModelInitializer , constants .Launcher ).
2146+ Parallelism (1 , constants .DatasetInitializer , constants .ModelInitializer , constants .Launcher ).
2147+ Completions (1 , constants .DatasetInitializer , constants .ModelInitializer , constants .Launcher ).
2148+ NumNodes (5 ).
2149+ Container (constants .DatasetInitializer , constants .DatasetInitializer , "test:runtime" , []string {"runtime" }, []string {"runtime" }, resRequests ).
2150+ Container (constants .ModelInitializer , constants .ModelInitializer , "test:runtime" , []string {"runtime" }, []string {"runtime" }, resRequests ).
2151+ Container (constants .Node , constants .Node , "test:runtime" , []string {"runtime" }, []string {"runtime" }, resRequests ).
2152+ PodLabel (schedulerpluginsv1alpha1 .PodGroupLabel , "test-job" ).
2153+ Obj (),
2154+ testingutil .MakeSchedulerPluginsPodGroup (metav1 .NamespaceDefault , "test-job" ).
2155+ ControllerReference (trainer .SchemeGroupVersion .WithKind (trainer .TrainJobKind ), "test-job" , "uid" ).
2156+ MinMember (7 ). // 7 = 5 (NumNodes, NOT 5*4=20) + 1 (DatasetInitializer) + 1 (ModelInitializer)
// NOTE(review): the CPU total equals MinMember, which presumably means resRequests asks for
// 1 CPU per pod — confirm against the resRequests definition (outside this view).
2157+ MinResources (corev1.ResourceList {
2158+ corev1 .ResourceCPU : resource .MustParse ("7" ),
2159+ }).
// Matches the 120 passed to PodGroupPolicyCoschedulingSchedulingTimeout on the runtime above.
2160+ SchedulingTimeout (120 ).
2161+ Obj (),
2162+ },
2163+ },
20682164 // Failed test cases.
20692165 "missing trainingRuntime resource" : {
20702166 trainJob : testingutil .MakeTrainJobWrapper (metav1 .NamespaceDefault , "test-job-3" ).
0 commit comments