@@ -24,7 +24,6 @@ import (
2424 "sort"
2525
2626 apicommon "github.com/ai-dynamo/grove/operator/api/common"
27- apicommonconstants "github.com/ai-dynamo/grove/operator/api/common/constants"
2827 grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
2928 "github.com/ai-dynamo/grove/operator/internal/clustertopology"
3029 "github.com/ai-dynamo/grove/operator/internal/constants"
@@ -186,14 +185,14 @@ func buildExpectedBasePodGangForPCSReplica(sc *syncContext, pcsReplica int) (*po
186185 pg := & podGangInfo {
187186 fqn : podGangFQN ,
188187 // TopologyConstraint for the base PodGang comes from the topology constraint defined at the PCS level.
189- topologyConstraint : createTopologyPackConstraint (sc , apicommonconstants . KindPodCliqueSet , client .ObjectKeyFromObject (sc .pcs ), sc .pcs .Spec .Template .TopologyConstraint ),
188+ topologyConstraint : createTopologyPackConstraint (sc , client .ObjectKeyFromObject (sc .pcs ), sc .pcs .Spec .Template .TopologyConstraint ),
190189 }
191190 pclqInfos := make ([]pclqInfo , 0 , len (sc .pcs .Spec .Template .Cliques ))
192191
193192 // Add all standalone PodCliques to the base PodGang PCLQs
194193 pclqInfos = append (pclqInfos , buildStandalonePCLQInfosForBasePodGang (sc , pcsReplica )... )
195194 // Compute PCSG PodCliques and TopologyConstraintGroupConfig's that are part of the base PodGang
196- pcsgPackConstraints , pcsgPodCliques , err := buildPCSGGroupPackConstraintsAndPCLQsForBasePodGang (sc , pcsReplica )
195+ pcsgPackConstraints , pcsgPodCliques , err := buildPCSGPackConstraintsAndPCLQsForBasePodGang (sc , pcsReplica )
197196 if err != nil {
198197 return nil , fmt .Errorf ("failed to build PCSG TopologyConstraintGroupConfigs and PodClique infos for base PodGang %q: %w" , podGangFQN , err )
199198 }
@@ -217,15 +216,15 @@ func buildStandalonePCLQInfosForBasePodGang(sc *syncContext, pcsReplica int) []p
217216 return pclqInfos
218217}
219218
220- func buildPCSGGroupPackConstraintsAndPCLQsForBasePodGang (sc * syncContext , pcsReplica int ) ([]groveschedulerv1alpha1.TopologyConstraintGroupConfig , []pclqInfo , error ) {
219+ func buildPCSGPackConstraintsAndPCLQsForBasePodGang (sc * syncContext , pcsReplica int ) ([]groveschedulerv1alpha1.TopologyConstraintGroupConfig , []pclqInfo , error ) {
221220 var (
222221 pclqInfos []pclqInfo
223222 pcsgPackConstraints []groveschedulerv1alpha1.TopologyConstraintGroupConfig
224223 )
225224 for _ , pcsgConfig := range sc .pcs .Spec .Template .PodCliqueScalingGroupConfigs {
226225 // Iterate through replicas of the PCSG that belong to the base PodGang [0, minAvailable-1]
227226 minAvailable := int (* pcsgConfig .MinAvailable )
228- pcsgTopologyConstraints , pcsgPodCliqueInfos , err := doBuildPCSGGroupPackConstraintsAndPCLQs (sc , pcsReplica , pcsgConfig , 0 , minAvailable )
227+ pcsgPodCliqueInfos , pcsgTopologyConstraints , err := doBuildBasePodGangPCLQsAndPCSGPackConstraints (sc , pcsReplica , pcsgConfig , minAvailable )
229228 if err != nil {
230229 return nil , nil , fmt .Errorf ("failed to build PCSG TopologyConstraintGroupConfigs and PodClique infos for base PodGang for PCSG %q: %w" , pcsgConfig .Name , err )
231230 }
@@ -235,39 +234,15 @@ func buildPCSGGroupPackConstraintsAndPCLQsForBasePodGang(sc *syncContext, pcsRep
235234 return pcsgPackConstraints , pclqInfos , nil
236235}
237236
238- func (r _resource ) buildExpectedScaledPodGangsForPCSG (sc * syncContext , pcsReplica int ) ([]* podGangInfo , error ) {
239- var expectedPodGangs []* podGangInfo
240- for _ , pcsgConfig := range sc .pcs .Spec .Template .PodCliqueScalingGroupConfigs {
241- pcsgFQN := apicommon .GeneratePodCliqueScalingGroupName (apicommon.ResourceNameReplica {Name : sc .pcs .Name , Replica : pcsReplica }, pcsgConfig .Name )
242- replicas := sc .determinePCSGReplicas (pcsgFQN , pcsgConfig )
243- minAvailable := int (* pcsgConfig .MinAvailable )
244- scaledReplicas := replicas - minAvailable
245- for podGangIndex , pcsgReplica := 0 , minAvailable ; podGangIndex < scaledReplicas ; podGangIndex , pcsgReplica = podGangIndex + 1 , pcsgReplica + 1 {
246- podGangName := apicommon .CreatePodGangNameFromPCSGFQN (pcsgFQN , podGangIndex )
247- pcsgTopologyConstraints , pcsgPodCliqueInfos , err := doBuildPCSGGroupPackConstraintsAndPCLQs (sc , pcsReplica , pcsgConfig , pcsgReplica , pcsgReplica + 1 )
248- if err != nil {
249- return nil , fmt .Errorf ("failed to build PCSG TopologyConstraintGroupConfigs and PodClique infos for scaled PodGang %q for PCSG %q replica %d: %w" , podGangName , pcsgConfig .Name , pcsgReplica , err )
250- }
251- expectedPodGangs = append (expectedPodGangs , & podGangInfo {
252- fqn : podGangName ,
253- pclqs : pcsgPodCliqueInfos ,
254- pcsgTopologyConstraints : pcsgTopologyConstraints ,
255- })
256- }
257- }
258- return expectedPodGangs , nil
259- }
260-
261- // doBuildPCSGGroupPackConstraintsAndPCLQs builds TopologyConstraintGroupConfigs and pclqInfos for a given PCSG and replica range.
262- // This function is used for both base PodGang and scaled PodGangs.
263- func doBuildPCSGGroupPackConstraintsAndPCLQs (sc * syncContext , pcsReplica int , pcsgConfig grovecorev1alpha1.PodCliqueScalingGroupConfig , pcsgReplicaStartIndex , pcsgReplicaEndIndex int ) ([]groveschedulerv1alpha1.TopologyConstraintGroupConfig , []pclqInfo , error ) {
237+ // doBuildBasePodGangPCLQsAndPCSGPackConstraints builds pclqInfos and TopologyConstraintGroupConfigs for a PCSG within a base PodGang.
238+ func doBuildBasePodGangPCLQsAndPCSGPackConstraints (sc * syncContext , pcsReplica int , pcsgConfig grovecorev1alpha1.PodCliqueScalingGroupConfig , minAvailable int ) ([]pclqInfo , []groveschedulerv1alpha1.TopologyConstraintGroupConfig , error ) {
264239 var (
265240 pclqInfos []pclqInfo
266241 pcsgPackConstraints []groveschedulerv1alpha1.TopologyConstraintGroupConfig
267242 )
268243
269244 pcsgFQN := apicommon .GeneratePodCliqueScalingGroupName (apicommon.ResourceNameReplica {Name : sc .pcs .Name , Replica : pcsReplica }, pcsgConfig .Name )
270- for replicaIndex := pcsgReplicaStartIndex ; replicaIndex < pcsgReplicaEndIndex ; replicaIndex ++ {
245+ for replicaIndex := 0 ; replicaIndex < minAvailable ; replicaIndex ++ {
271246 // Iterate through each PCLQ within the PCSG
272247 pclqFQNs := make ([]string , 0 , len (pcsgConfig .CliqueNames ))
273248 for _ , pclqName := range pcsgConfig .CliqueNames {
@@ -285,12 +260,55 @@ func doBuildPCSGGroupPackConstraintsAndPCLQs(sc *syncContext, pcsReplica int, pc
285260 pcsgPackConstraints = append (pcsgPackConstraints , groveschedulerv1alpha1.TopologyConstraintGroupConfig {
286261 Name : fmt .Sprintf ("%s-%d" , pcsgFQN , replicaIndex ),
287262 PodGroupNames : pclqFQNs ,
288- TopologyConstraint : createTopologyPackConstraint (sc , apicommonconstants . KindPodCliqueScalingGroup , types.NamespacedName {Namespace : sc .pcs .Namespace , Name : pcsgFQN }, pcsgConfig .TopologyConstraint ),
263+ TopologyConstraint : createTopologyPackConstraint (sc , types.NamespacedName {Namespace : sc .pcs .Namespace , Name : pcsgFQN }, pcsgConfig .TopologyConstraint ),
289264 })
290265 }
291266 }
292267
293- return pcsgPackConstraints , pclqInfos , nil
268+ return pclqInfos , pcsgPackConstraints , nil
269+ }
270+
271+ func (r _resource ) buildExpectedScaledPodGangsForPCSG (sc * syncContext , pcsReplica int ) ([]* podGangInfo , error ) {
272+ var expectedPodGangs []* podGangInfo
273+ for _ , pcsgConfig := range sc .pcs .Spec .Template .PodCliqueScalingGroupConfigs {
274+ pcsgFQN := apicommon .GeneratePodCliqueScalingGroupName (apicommon.ResourceNameReplica {Name : sc .pcs .Name , Replica : pcsReplica }, pcsgConfig .Name )
275+ replicas := sc .determinePCSGReplicas (pcsgFQN , pcsgConfig )
276+ minAvailable := int (* pcsgConfig .MinAvailable )
277+ scaledReplicas := replicas - minAvailable
278+ for podGangIndex , pcsgReplica := 0 , minAvailable ; podGangIndex < scaledReplicas ; podGangIndex , pcsgReplica = podGangIndex + 1 , pcsgReplica + 1 {
279+ pg , err := doBuildExpectedScaledPodGangForPCSG (sc , pcsgFQN , pcsgConfig , pcsgReplica , podGangIndex )
280+ if err != nil {
281+ return nil , fmt .Errorf ("failed to build expected scaled PodGang for PCSG %q replica %d: %w" , pcsgFQN , pcsgReplica , err )
282+ }
283+ expectedPodGangs = append (expectedPodGangs , pg )
284+ }
285+ }
286+ return expectedPodGangs , nil
287+ }
288+
289+ func doBuildExpectedScaledPodGangForPCSG (sc * syncContext , pcsgFQN string , pcsgConfig grovecorev1alpha1.PodCliqueScalingGroupConfig , pcsgReplica int , podGangIndex int ) (* podGangInfo , error ) {
290+ var (
291+ pclqInfos = make ([]pclqInfo , 0 , len (pcsgConfig .CliqueNames ))
292+ topologyConstraint * groveschedulerv1alpha1.TopologyConstraint
293+ )
294+
295+ // Iterate through each PCLQ within the PCSG
296+ for _ , pclqName := range pcsgConfig .CliqueNames {
297+ pclqTemplateSpec := componentutils .FindPodCliqueTemplateSpecByName (sc .pcs , pclqName )
298+ if pclqTemplateSpec == nil {
299+ return nil , fmt .Errorf ("PodCliqueScalingGroup %q references a PodClique %q that does not exist in the PodCliqueSet: %v" , pcsgConfig .Name , pclqName , client .ObjectKeyFromObject (sc .pcs ))
300+ }
301+ pclqFQN := apicommon .GeneratePodCliqueName (apicommon.ResourceNameReplica {Name : pcsgFQN , Replica : pcsgReplica }, pclqName )
302+ pclqInfos = append (pclqInfos , buildPodCliqueInfo (sc , pclqTemplateSpec , pclqFQN , true ))
303+ }
304+ topologyConstraint = createTopologyPackConstraint (sc , types.NamespacedName {Namespace : sc .pcs .Namespace , Name : pcsgFQN }, pcsgConfig .TopologyConstraint )
305+ pg := & podGangInfo {
306+ fqn : apicommon .CreatePodGangNameFromPCSGFQN (pcsgFQN , podGangIndex ),
307+ topologyConstraint : topologyConstraint ,
308+ pclqs : pclqInfos ,
309+ }
310+
311+ return pg , nil
294312}
295313
296314// buildPodCliqueInfo creates pclqInfo with appropriate replica counts.
@@ -301,36 +319,34 @@ func buildPodCliqueInfo(sc *syncContext, pclqTemplateSpec *grovecorev1alpha1.Pod
301319 replicas : replicas ,
302320 minAvailable : * pclqTemplateSpec .Spec .MinAvailable ,
303321 }
304- if sc .tasEnabled {
305- expectedPCLQ .packConstraint = createTopologyPackConstraint (sc , apicommonconstants .KindPodClique , types.NamespacedName {Namespace : sc .pcs .Namespace , Name : pclqFQN }, pclqTemplateSpec .TopologyConstraint )
306- }
322+ expectedPCLQ .topologyConstraint = createTopologyPackConstraint (sc , types.NamespacedName {Namespace : sc .pcs .Namespace , Name : pclqFQN }, pclqTemplateSpec .TopologyConstraint )
307323 return expectedPCLQ
308324}
309325
310326// createTopologyPackConstraint creates a TopologyPackConstraint based on the sync context and provided parameters for a resource.
311327// PackConstraints are defined at multiple levels (PodCliqueSet, PodCliqueScalingGroup, PodClique). This function helps create a TopologyPackConstraint for any of these levels.
312- func createTopologyPackConstraint (sc * syncContext , resourceKind string , nsName types.NamespacedName , requiredTopologyConstraint * grovecorev1alpha1.TopologyConstraint ) * groveschedulerv1alpha1.TopologyConstraint {
328+ func createTopologyPackConstraint (sc * syncContext , nsName types.NamespacedName , requiredTopologyConstraint * grovecorev1alpha1.TopologyConstraint ) * groveschedulerv1alpha1.TopologyConstraint {
313329 // If Topology aware scheduling is disabled, return nil even if TopologyConstraint is specified.
314- if ! sc .tasEnabled {
330+ if ! sc .tasEnabled || requiredTopologyConstraint == nil {
315331 return nil
316332 }
317- pgPackConstraint := & groveschedulerv1alpha1.TopologyPackConstraint {}
333+ var pgPackConstraint * groveschedulerv1alpha1.TopologyPackConstraint
318334 // If requiredTopologyConstraint is specified, set the required topology key accordingly.
319- if requiredTopologyConstraint != nil {
320- requiredTopologyLevel , found := lo . Find ( sc . topologyLevels , func ( topologyLevel grovecorev1alpha1. TopologyLevel ) bool {
321- return topologyLevel . Domain == requiredTopologyConstraint . PackDomain
322- })
323- if ! found {
324- // This can only happen if the ClusterTopology CR has been updated and no longer contains a topology level
325- // that is being referenced by the resource's TopologyConstraint.
326- // In the current version it's been decided to log this occurrence and skip setting the required constraint which is equivalent
327- // to nullifying the required constraint.
328- sc . logger . Info ( "required topology domain not found in cluster topology levels, skipping setting required pack constraint" , "kind" , resourceKind , "namespacedName" , nsName , "requiredTopologyConstraint" , * requiredTopologyConstraint )
329- } else {
330- pgPackConstraint . Required = ptr .To (requiredTopologyLevel .Key )
335+ requiredTopologyLevel , found := lo . Find ( sc . topologyLevels , func ( topologyLevel grovecorev1alpha1. TopologyLevel ) bool {
336+ return topologyLevel . Domain == requiredTopologyConstraint . PackDomain
337+ })
338+ if ! found {
339+ // This can only happen if the ClusterTopology CR has been updated and no longer contains a topology level
340+ // that is being referenced by the resource's TopologyConstraint.
341+ // In the current version it's been decided to log this occurrence and skip setting the required constraint which is equivalent
342+ // to nullifying the required constraint.
343+ sc . logger . Info ( "required topology domain not found in cluster topology levels, skipping setting required pack constraint" , "namespacedName" , nsName , "requiredTopologyConstraint" , * requiredTopologyConstraint )
344+ } else {
345+ pgPackConstraint = & groveschedulerv1alpha1. TopologyPackConstraint {
346+ Required : ptr .To (requiredTopologyLevel .Key ),
331347 }
332348 }
333- return & groveschedulerv1alpha1.TopologyConstraint {PackConstraint : pgPackConstraint }
349+ return lo . Ternary ( pgPackConstraint != nil , & groveschedulerv1alpha1.TopologyConstraint {PackConstraint : pgPackConstraint }, nil )
334350}
335351
336352// determinePodCliqueReplicas determines replica count considering HPA mutations.
@@ -543,7 +559,7 @@ func createPodGroupsForPodGang(namespace string, pgInfo *podGangInfo) []grovesch
543559 Name : pi .fqn ,
544560 PodReferences : namespacedNames ,
545561 MinReplicas : pi .minAvailable ,
546- TopologyConstraint : pi .packConstraint ,
562+ TopologyConstraint : pi .topologyConstraint ,
547563 }
548564 })
549565 return podGroups
@@ -673,7 +689,7 @@ type podGangInfo struct {
673689 fqn string
674690 // pclqs holds the relevant information for all constituent PodCliques for this PodGang.
675691 pclqs []pclqInfo
676- // packConstraint holds the topology pack constraint applicable at the PodGang level.
692+ // topologyConstraint holds the topology pack constraint applicable at the PodGang level.
677693 // These will be cleared when TAS is disabled.
678694 topologyConstraint * groveschedulerv1alpha1.TopologyConstraint
679695 // pcsgPackConstraints holds the topology pack constraints applicable at the PodCliqueScalingGroup level.
@@ -702,7 +718,7 @@ type pclqInfo struct {
702718 // associatedPodNames are Pod names (having this PodClique as an owner) that have already been associated to this PodGang.
703719 // This will be updated as and when pods are either deleted or new pods are associated.
704720 associatedPodNames []string
705- // packConstraint holds the topology pack constraint for the PodClique.
721+ // topologyConstraint holds the topology pack constraint for the PodClique.
706722 // These will be cleared when TAS is disabled.
707- packConstraint * groveschedulerv1alpha1.TopologyConstraint
723+ topologyConstraint * groveschedulerv1alpha1.TopologyConstraint
708724}
0 commit comments