Skip to content

Commit 095d529

Browse files
authored
Fixes TopologyConstraints for scaled PodGangs (#340)
* Corrected godoc for PodGang to clearly specify the meaning of TopologyConstraint and TopologyConstraintGroupConfigs fields in PodGangSpec. * Fixed computation of expected PodGangs w.r.t topology constraints. * Added unit test for computeExpectedPodGangs for different combinations. Signed-off-by: Madhav Bhargava <[email protected]> --------- Signed-off-by: Madhav Bhargava <[email protected]>
1 parent 7b92d6b commit 095d529

File tree

6 files changed

+464
-198
lines changed

6 files changed

+464
-198
lines changed

docs/api-reference/scheduler-api.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,8 @@ _Appears in:_
8585
| Field | Description | Default | Validation |
8686
| --- | --- | --- | --- |
8787
| `podgroups` _[PodGroup](#podgroup) array_ | PodGroups is a list of member pod groups in the PodGang. | | |
88-
| `topologyConstraint` _[TopologyConstraint](#topologyconstraint)_ | TopologyConstraint defines topology packing constraints for entire pod gang.<br />Translated from PodCliqueSet.TopologyConstraint.<br />Updated by operator on each reconciliation when PodCliqueSet topology constraints change. | | |
89-
| `topologyConstraintGroupConfigs` _[TopologyConstraintGroupConfig](#topologyconstraintgroupconfig) array_ | TopologyConstraintGroupConfigs defines TopologyConstraints for a group of PodGroups when it is a strict subset<br />of total number of PodGroups for topology-aware placement. | | |
88+
| `topologyConstraint` _[TopologyConstraint](#topologyconstraint)_ | TopologyConstraint defines topology packing constraints for entire pod gang.<br />This is the top level topology constraint that applies to all PodGroups in the PodGang.<br />Updated by operator on each reconciliation when PodCliqueSet topology constraints change. | | |
89+
| `topologyConstraintGroupConfigs` _[TopologyConstraintGroupConfig](#topologyconstraintgroupconfig) array_ | TopologyConstraintGroupConfigs defines TopologyConstraints for a strict subset of PodGroups. | | |
9090
| `priorityClassName` _string_ | PriorityClassName is the name of the PriorityClass for the PodGang. | | |
9191
| `reuseReservationRef` _[NamespacedName](#namespacedname)_ | ReuseReservationRef holds the reference to another PodGang resource scheduled previously.<br />During updates, an operator can suggest to reuse the reservation of the previous PodGang for a newer version of the<br />PodGang resource. This is a suggestion for the scheduler and not a requirement that must be met. If the scheduler plugin<br />finds that the reservation done previously was network optimised and there are no better alternatives available, then it<br />will reuse the reservation. If there are better alternatives available, then the scheduler will ignore this suggestion. | | |
9292

operator/internal/controller/podcliqueset/components/podgang/syncflow.go

Lines changed: 73 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ import (
2424
"sort"
2525

2626
apicommon "github.com/ai-dynamo/grove/operator/api/common"
27-
apicommonconstants "github.com/ai-dynamo/grove/operator/api/common/constants"
2827
grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
2928
"github.com/ai-dynamo/grove/operator/internal/clustertopology"
3029
"github.com/ai-dynamo/grove/operator/internal/constants"
@@ -186,14 +185,14 @@ func buildExpectedBasePodGangForPCSReplica(sc *syncContext, pcsReplica int) (*po
186185
pg := &podGangInfo{
187186
fqn: podGangFQN,
188187
// TopologyConstraint for the base PodGang comes from the topology constraint defined at the PCS level.
189-
topologyConstraint: createTopologyPackConstraint(sc, apicommonconstants.KindPodCliqueSet, client.ObjectKeyFromObject(sc.pcs), sc.pcs.Spec.Template.TopologyConstraint),
188+
topologyConstraint: createTopologyPackConstraint(sc, client.ObjectKeyFromObject(sc.pcs), sc.pcs.Spec.Template.TopologyConstraint),
190189
}
191190
pclqInfos := make([]pclqInfo, 0, len(sc.pcs.Spec.Template.Cliques))
192191

193192
// Add all standalone PodCliques to the base PodGang PCLQs
194193
pclqInfos = append(pclqInfos, buildStandalonePCLQInfosForBasePodGang(sc, pcsReplica)...)
195194
// Compute PCSG PodCliques and TopologyConstraintGroupConfig's that are part of the base PodGang
196-
pcsgPackConstraints, pcsgPodCliques, err := buildPCSGGroupPackConstraintsAndPCLQsForBasePodGang(sc, pcsReplica)
195+
pcsgPackConstraints, pcsgPodCliques, err := buildPCSGPackConstraintsAndPCLQsForBasePodGang(sc, pcsReplica)
197196
if err != nil {
198197
return nil, fmt.Errorf("failed to build PCSG TopologyConstraintGroupConfigs and PodClique infos for base PodGang %q: %w", podGangFQN, err)
199198
}
@@ -217,15 +216,15 @@ func buildStandalonePCLQInfosForBasePodGang(sc *syncContext, pcsReplica int) []p
217216
return pclqInfos
218217
}
219218

220-
func buildPCSGGroupPackConstraintsAndPCLQsForBasePodGang(sc *syncContext, pcsReplica int) ([]groveschedulerv1alpha1.TopologyConstraintGroupConfig, []pclqInfo, error) {
219+
func buildPCSGPackConstraintsAndPCLQsForBasePodGang(sc *syncContext, pcsReplica int) ([]groveschedulerv1alpha1.TopologyConstraintGroupConfig, []pclqInfo, error) {
221220
var (
222221
pclqInfos []pclqInfo
223222
pcsgPackConstraints []groveschedulerv1alpha1.TopologyConstraintGroupConfig
224223
)
225224
for _, pcsgConfig := range sc.pcs.Spec.Template.PodCliqueScalingGroupConfigs {
226225
// Iterate through replicas of the PCSG that belong to the base PodGang [0, minAvailable-1]
227226
minAvailable := int(*pcsgConfig.MinAvailable)
228-
pcsgTopologyConstraints, pcsgPodCliqueInfos, err := doBuildPCSGGroupPackConstraintsAndPCLQs(sc, pcsReplica, pcsgConfig, 0, minAvailable)
227+
pcsgPodCliqueInfos, pcsgTopologyConstraints, err := doBuildBasePodGangPCLQsAndPCSGPackConstraints(sc, pcsReplica, pcsgConfig, minAvailable)
229228
if err != nil {
230229
return nil, nil, fmt.Errorf("failed to build PCSG TopologyConstraintGroupConfigs and PodClique infos for base PodGang for PCSG %q: %w", pcsgConfig.Name, err)
231230
}
@@ -235,39 +234,15 @@ func buildPCSGGroupPackConstraintsAndPCLQsForBasePodGang(sc *syncContext, pcsRep
235234
return pcsgPackConstraints, pclqInfos, nil
236235
}
237236

238-
func (r _resource) buildExpectedScaledPodGangsForPCSG(sc *syncContext, pcsReplica int) ([]*podGangInfo, error) {
239-
var expectedPodGangs []*podGangInfo
240-
for _, pcsgConfig := range sc.pcs.Spec.Template.PodCliqueScalingGroupConfigs {
241-
pcsgFQN := apicommon.GeneratePodCliqueScalingGroupName(apicommon.ResourceNameReplica{Name: sc.pcs.Name, Replica: pcsReplica}, pcsgConfig.Name)
242-
replicas := sc.determinePCSGReplicas(pcsgFQN, pcsgConfig)
243-
minAvailable := int(*pcsgConfig.MinAvailable)
244-
scaledReplicas := replicas - minAvailable
245-
for podGangIndex, pcsgReplica := 0, minAvailable; podGangIndex < scaledReplicas; podGangIndex, pcsgReplica = podGangIndex+1, pcsgReplica+1 {
246-
podGangName := apicommon.CreatePodGangNameFromPCSGFQN(pcsgFQN, podGangIndex)
247-
pcsgTopologyConstraints, pcsgPodCliqueInfos, err := doBuildPCSGGroupPackConstraintsAndPCLQs(sc, pcsReplica, pcsgConfig, pcsgReplica, pcsgReplica+1)
248-
if err != nil {
249-
return nil, fmt.Errorf("failed to build PCSG TopologyConstraintGroupConfigs and PodClique infos for scaled PodGang %q for PCSG %q replica %d: %w", podGangName, pcsgConfig.Name, pcsgReplica, err)
250-
}
251-
expectedPodGangs = append(expectedPodGangs, &podGangInfo{
252-
fqn: podGangName,
253-
pclqs: pcsgPodCliqueInfos,
254-
pcsgTopologyConstraints: pcsgTopologyConstraints,
255-
})
256-
}
257-
}
258-
return expectedPodGangs, nil
259-
}
260-
261-
// doBuildPCSGGroupPackConstraintsAndPCLQs builds TopologyConstraintGroupConfigs and pclqInfos for a given PCSG and replica range.
262-
// This function is used for both base PodGang and scaled PodGangs.
263-
func doBuildPCSGGroupPackConstraintsAndPCLQs(sc *syncContext, pcsReplica int, pcsgConfig grovecorev1alpha1.PodCliqueScalingGroupConfig, pcsgReplicaStartIndex, pcsgReplicaEndIndex int) ([]groveschedulerv1alpha1.TopologyConstraintGroupConfig, []pclqInfo, error) {
237+
// doBuildBasePodGangPCLQsAndPCSGPackConstraints builds pclqInfos and TopologyConstraintGroupConfigs for a PCSG within a base PodGang.
238+
func doBuildBasePodGangPCLQsAndPCSGPackConstraints(sc *syncContext, pcsReplica int, pcsgConfig grovecorev1alpha1.PodCliqueScalingGroupConfig, minAvailable int) ([]pclqInfo, []groveschedulerv1alpha1.TopologyConstraintGroupConfig, error) {
264239
var (
265240
pclqInfos []pclqInfo
266241
pcsgPackConstraints []groveschedulerv1alpha1.TopologyConstraintGroupConfig
267242
)
268243

269244
pcsgFQN := apicommon.GeneratePodCliqueScalingGroupName(apicommon.ResourceNameReplica{Name: sc.pcs.Name, Replica: pcsReplica}, pcsgConfig.Name)
270-
for replicaIndex := pcsgReplicaStartIndex; replicaIndex < pcsgReplicaEndIndex; replicaIndex++ {
245+
for replicaIndex := 0; replicaIndex < minAvailable; replicaIndex++ {
271246
// Iterate through each PCLQ within the PCSG
272247
pclqFQNs := make([]string, 0, len(pcsgConfig.CliqueNames))
273248
for _, pclqName := range pcsgConfig.CliqueNames {
@@ -285,12 +260,55 @@ func doBuildPCSGGroupPackConstraintsAndPCLQs(sc *syncContext, pcsReplica int, pc
285260
pcsgPackConstraints = append(pcsgPackConstraints, groveschedulerv1alpha1.TopologyConstraintGroupConfig{
286261
Name: fmt.Sprintf("%s-%d", pcsgFQN, replicaIndex),
287262
PodGroupNames: pclqFQNs,
288-
TopologyConstraint: createTopologyPackConstraint(sc, apicommonconstants.KindPodCliqueScalingGroup, types.NamespacedName{Namespace: sc.pcs.Namespace, Name: pcsgFQN}, pcsgConfig.TopologyConstraint),
263+
TopologyConstraint: createTopologyPackConstraint(sc, types.NamespacedName{Namespace: sc.pcs.Namespace, Name: pcsgFQN}, pcsgConfig.TopologyConstraint),
289264
})
290265
}
291266
}
292267

293-
return pcsgPackConstraints, pclqInfos, nil
268+
return pclqInfos, pcsgPackConstraints, nil
269+
}
270+
271+
func (r _resource) buildExpectedScaledPodGangsForPCSG(sc *syncContext, pcsReplica int) ([]*podGangInfo, error) {
272+
var expectedPodGangs []*podGangInfo
273+
for _, pcsgConfig := range sc.pcs.Spec.Template.PodCliqueScalingGroupConfigs {
274+
pcsgFQN := apicommon.GeneratePodCliqueScalingGroupName(apicommon.ResourceNameReplica{Name: sc.pcs.Name, Replica: pcsReplica}, pcsgConfig.Name)
275+
replicas := sc.determinePCSGReplicas(pcsgFQN, pcsgConfig)
276+
minAvailable := int(*pcsgConfig.MinAvailable)
277+
scaledReplicas := replicas - minAvailable
278+
for podGangIndex, pcsgReplica := 0, minAvailable; podGangIndex < scaledReplicas; podGangIndex, pcsgReplica = podGangIndex+1, pcsgReplica+1 {
279+
pg, err := doBuildExpectedScaledPodGangForPCSG(sc, pcsgFQN, pcsgConfig, pcsgReplica, podGangIndex)
280+
if err != nil {
281+
return nil, fmt.Errorf("failed to build expected scaled PodGang for PCSG %q replica %d: %w", pcsgFQN, pcsgReplica, err)
282+
}
283+
expectedPodGangs = append(expectedPodGangs, pg)
284+
}
285+
}
286+
return expectedPodGangs, nil
287+
}
288+
289+
func doBuildExpectedScaledPodGangForPCSG(sc *syncContext, pcsgFQN string, pcsgConfig grovecorev1alpha1.PodCliqueScalingGroupConfig, pcsgReplica int, podGangIndex int) (*podGangInfo, error) {
290+
var (
291+
pclqInfos = make([]pclqInfo, 0, len(pcsgConfig.CliqueNames))
292+
topologyConstraint *groveschedulerv1alpha1.TopologyConstraint
293+
)
294+
295+
// Iterate through each PCLQ within the PCSG
296+
for _, pclqName := range pcsgConfig.CliqueNames {
297+
pclqTemplateSpec := componentutils.FindPodCliqueTemplateSpecByName(sc.pcs, pclqName)
298+
if pclqTemplateSpec == nil {
299+
return nil, fmt.Errorf("PodCliqueScalingGroup %q references a PodClique %q that does not exist in the PodCliqueSet: %v", pcsgConfig.Name, pclqName, client.ObjectKeyFromObject(sc.pcs))
300+
}
301+
pclqFQN := apicommon.GeneratePodCliqueName(apicommon.ResourceNameReplica{Name: pcsgFQN, Replica: pcsgReplica}, pclqName)
302+
pclqInfos = append(pclqInfos, buildPodCliqueInfo(sc, pclqTemplateSpec, pclqFQN, true))
303+
}
304+
topologyConstraint = createTopologyPackConstraint(sc, types.NamespacedName{Namespace: sc.pcs.Namespace, Name: pcsgFQN}, pcsgConfig.TopologyConstraint)
305+
pg := &podGangInfo{
306+
fqn: apicommon.CreatePodGangNameFromPCSGFQN(pcsgFQN, podGangIndex),
307+
topologyConstraint: topologyConstraint,
308+
pclqs: pclqInfos,
309+
}
310+
311+
return pg, nil
294312
}
295313

296314
// buildPodCliqueInfo creates pclqInfo with appropriate replica counts.
@@ -301,36 +319,34 @@ func buildPodCliqueInfo(sc *syncContext, pclqTemplateSpec *grovecorev1alpha1.Pod
301319
replicas: replicas,
302320
minAvailable: *pclqTemplateSpec.Spec.MinAvailable,
303321
}
304-
if sc.tasEnabled {
305-
expectedPCLQ.packConstraint = createTopologyPackConstraint(sc, apicommonconstants.KindPodClique, types.NamespacedName{Namespace: sc.pcs.Namespace, Name: pclqFQN}, pclqTemplateSpec.TopologyConstraint)
306-
}
322+
expectedPCLQ.topologyConstraint = createTopologyPackConstraint(sc, types.NamespacedName{Namespace: sc.pcs.Namespace, Name: pclqFQN}, pclqTemplateSpec.TopologyConstraint)
307323
return expectedPCLQ
308324
}
309325

310326
// createTopologyPackConstraint creates a TopologyPackConstraint based on the sync context and provided parameters for a resource.
311327
// PackConstraints are defined at multiple levels (PodCliqueSet, PodCliqueScalingGroup, PodClique). This function helps create a TopologyPackConstraint for any of these levels.
312-
func createTopologyPackConstraint(sc *syncContext, resourceKind string, nsName types.NamespacedName, requiredTopologyConstraint *grovecorev1alpha1.TopologyConstraint) *groveschedulerv1alpha1.TopologyConstraint {
328+
func createTopologyPackConstraint(sc *syncContext, nsName types.NamespacedName, requiredTopologyConstraint *grovecorev1alpha1.TopologyConstraint) *groveschedulerv1alpha1.TopologyConstraint {
313329
// If Topology aware scheduling is disabled, return nil even if TopologyConstraint is specified.
314-
if !sc.tasEnabled {
330+
if !sc.tasEnabled || requiredTopologyConstraint == nil {
315331
return nil
316332
}
317-
pgPackConstraint := &groveschedulerv1alpha1.TopologyPackConstraint{}
333+
var pgPackConstraint *groveschedulerv1alpha1.TopologyPackConstraint
318334
// If requiredTopologyConstraint is specified, set the required topology key accordingly.
319-
if requiredTopologyConstraint != nil {
320-
requiredTopologyLevel, found := lo.Find(sc.topologyLevels, func(topologyLevel grovecorev1alpha1.TopologyLevel) bool {
321-
return topologyLevel.Domain == requiredTopologyConstraint.PackDomain
322-
})
323-
if !found {
324-
// This can only happen if the ClusterTopology CR has been updated and no longer contains a topology level
325-
// that is being referenced by the resource's TopologyConstraint.
326-
// In the current version it's been decided to log this occurrence and skip setting the required constraint which is equivalent
327-
// to nullifying the required constraint.
328-
sc.logger.Info("required topology domain not found in cluster topology levels, skipping setting required pack constraint", "kind", resourceKind, "namespacedName", nsName, "requiredTopologyConstraint", *requiredTopologyConstraint)
329-
} else {
330-
pgPackConstraint.Required = ptr.To(requiredTopologyLevel.Key)
335+
requiredTopologyLevel, found := lo.Find(sc.topologyLevels, func(topologyLevel grovecorev1alpha1.TopologyLevel) bool {
336+
return topologyLevel.Domain == requiredTopologyConstraint.PackDomain
337+
})
338+
if !found {
339+
// This can only happen if the ClusterTopology CR has been updated and no longer contains a topology level
340+
// that is being referenced by the resource's TopologyConstraint.
341+
// In the current version it's been decided to log this occurrence and skip setting the required constraint which is equivalent
342+
// to nullifying the required constraint.
343+
sc.logger.Info("required topology domain not found in cluster topology levels, skipping setting required pack constraint", "namespacedName", nsName, "requiredTopologyConstraint", *requiredTopologyConstraint)
344+
} else {
345+
pgPackConstraint = &groveschedulerv1alpha1.TopologyPackConstraint{
346+
Required: ptr.To(requiredTopologyLevel.Key),
331347
}
332348
}
333-
return &groveschedulerv1alpha1.TopologyConstraint{PackConstraint: pgPackConstraint}
349+
return lo.Ternary(pgPackConstraint != nil, &groveschedulerv1alpha1.TopologyConstraint{PackConstraint: pgPackConstraint}, nil)
334350
}
335351

336352
// determinePodCliqueReplicas determines replica count considering HPA mutations.
@@ -543,7 +559,7 @@ func createPodGroupsForPodGang(namespace string, pgInfo *podGangInfo) []grovesch
543559
Name: pi.fqn,
544560
PodReferences: namespacedNames,
545561
MinReplicas: pi.minAvailable,
546-
TopologyConstraint: pi.packConstraint,
562+
TopologyConstraint: pi.topologyConstraint,
547563
}
548564
})
549565
return podGroups
@@ -673,7 +689,7 @@ type podGangInfo struct {
673689
fqn string
674690
// pclqs holds the relevant information for all constituent PodCliques for this PodGang.
675691
pclqs []pclqInfo
676-
// packConstraint holds the topology pack constraint applicable at the PodGang level.
692+
// topologyConstraint holds the topology pack constraint applicable at the PodGang level.
677693
// These will be cleared when TAS is disabled.
678694
topologyConstraint *groveschedulerv1alpha1.TopologyConstraint
679695
// pcsgPackConstraints holds the topology pack constraints applicable at the PodCliqueScalingGroup level.
@@ -702,7 +718,7 @@ type pclqInfo struct {
702718
// associatedPodNames are Pod names (having this PodClique as an owner) that have already been associated to this PodGang.
703719
// This will be updated as and when pods are either deleted or new pods are associated.
704720
associatedPodNames []string
705-
// packConstraint holds the topology pack constraint for the PodClique.
721+
// topologyConstraint holds the topology pack constraint for the PodClique.
706722
// These will be cleared when TAS is disabled.
707-
packConstraint *groveschedulerv1alpha1.TopologyConstraint
723+
topologyConstraint *groveschedulerv1alpha1.TopologyConstraint
708724
}

0 commit comments

Comments
 (0)