Skip to content

Commit a5b8e0d

Browse files
authored
fix: correct PCSG topology constraint handling for scaled PodGangs (#357)
* Revert "fix: add PCSG topology constraints to scaled PodGangs (#347)" This reverts commit a08e5af. Signed-off-by: Ron Kahn <rkahn@nvidia.com> * fix: update topology constraint handling for scaled PodGangs Signed-off-by: Ron Kahn <rkahn@nvidia.com> * fix: only create PCSG topology constraints when defined Add nil check before creating PCSG topology constraint groups in base PodGang building. This prevents creating empty topology constraints when PCSG has no TopologyConstraint specified, ensuring scaled PodGangs properly fall back to PCS-level constraints. Signed-off-by: Ron Kahn <rkahn@nvidia.com> * test: add case for PCS with nil PCSG topology constraints fallback to pcs Signed-off-by: Ron Kahn <rkahn@nvidia.com> * test: clean up formatting in syncflow_test.go Signed-off-by: Ron Kahn <rkahn@nvidia.com> * test: clean up formatting in syncflow_test.go Signed-off-by: Ron Kahn <rkahn@nvidia.com> * test: remove redundant comments regarding pcsgConstraints in syncflow_test.go Signed-off-by: Ron Kahn <rkahn@nvidia.com> * test: improve formatting and remove redundant comment in syncflow_test.go Signed-off-by: Ron Kahn <rkahn@nvidia.com> * test: clean up formatting in syncflow_test.go Signed-off-by: Ron Kahn <rkahn@nvidia.com> --------- Signed-off-by: Ron Kahn <rkahn@nvidia.com>
1 parent a08e5af commit a5b8e0d

File tree

2 files changed

+67
-22
lines changed

2 files changed

+67
-22
lines changed

operator/internal/controller/podcliqueset/components/podgang/syncflow.go

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,7 @@ func doBuildBasePodGangPCLQsAndPCSGPackConstraints(sc *syncContext, pcsReplica i
254254
pclqInfos = append(pclqInfos, buildPodCliqueInfo(sc, pclqTemplateSpec, pclqFQN, true))
255255
pclqFQNs = append(pclqFQNs, pclqFQN)
256256
}
257-
if sc.tasEnabled {
257+
if sc.tasEnabled && pcsgConfig.TopologyConstraint != nil {
258258
// For every PCSG a TopologyConstraintGroupConfig is created which has its own TopologyConstraint that is
259259
// defined for PCLQs within the PCSG. For each PCSG replica there is a separate TopologyConstraintGroupConfig.
260260
pcsgPackConstraints = append(pcsgPackConstraints, groveschedulerv1alpha1.TopologyConstraintGroupConfig{
@@ -289,7 +289,6 @@ func (r _resource) buildExpectedScaledPodGangsForPCSG(sc *syncContext, pcsReplic
289289
func doBuildExpectedScaledPodGangForPCSG(sc *syncContext, pcsgFQN string, pcsgConfig grovecorev1alpha1.PodCliqueScalingGroupConfig, pcsgReplica int, podGangIndex int) (*podGangInfo, error) {
290290
var (
291291
pclqInfos = make([]pclqInfo, 0, len(pcsgConfig.CliqueNames))
292-
pclqFQNs = make([]string, 0, len(pcsgConfig.CliqueNames))
293292
topologyConstraint *groveschedulerv1alpha1.TopologyConstraint
294293
)
295294

@@ -301,25 +300,27 @@ func doBuildExpectedScaledPodGangForPCSG(sc *syncContext, pcsgFQN string, pcsgCo
301300
}
302301
pclqFQN := apicommon.GeneratePodCliqueName(apicommon.ResourceNameReplica{Name: pcsgFQN, Replica: pcsgReplica}, pclqName)
303302
pclqInfos = append(pclqInfos, buildPodCliqueInfo(sc, pclqTemplateSpec, pclqFQN, true))
304-
pclqFQNs = append(pclqFQNs, pclqFQN)
305303
}
306304

307-
var pcsgTopologyConstraints []groveschedulerv1alpha1.TopologyConstraintGroupConfig
305+
// For scaled PodGangs, the TopologyConstraint is determined as follows:
306+
// 1. If PCSG has a TopologyConstraint defined, use that for the PodGang's TopologyConstraint
307+
// 2. Else, fall back to PCS-level TopologyConstraint
308+
// no need to set pcsg topology constraint
308309
if sc.tasEnabled {
309-
topologyConstraint = createTopologyPackConstraint(sc, client.ObjectKeyFromObject(sc.pcs), sc.pcs.Spec.Template.TopologyConstraint)
310-
pcsgTopologyConstraint := groveschedulerv1alpha1.TopologyConstraintGroupConfig{
311-
Name: fmt.Sprintf("%s-%d", pcsgFQN, pcsgReplica),
312-
PodGroupNames: pclqFQNs,
313-
TopologyConstraint: createTopologyPackConstraint(sc, types.NamespacedName{Namespace: sc.pcs.Namespace, Name: pcsgFQN}, pcsgConfig.TopologyConstraint),
310+
if pcsgConfig.TopologyConstraint != nil {
311+
topologyConstraint = createTopologyPackConstraint(sc,
312+
types.NamespacedName{Namespace: sc.pcs.Namespace, Name: pcsgFQN}, pcsgConfig.TopologyConstraint)
313+
} else {
314+
// Fall back to PCS-level constraints
315+
topologyConstraint = createTopologyPackConstraint(sc, client.ObjectKeyFromObject(sc.pcs),
316+
sc.pcs.Spec.Template.TopologyConstraint)
314317
}
315-
pcsgTopologyConstraints = []groveschedulerv1alpha1.TopologyConstraintGroupConfig{pcsgTopologyConstraint}
316318
}
317319

318320
pg := &podGangInfo{
319-
fqn: apicommon.CreatePodGangNameFromPCSGFQN(pcsgFQN, podGangIndex),
320-
topologyConstraint: topologyConstraint,
321-
pclqs: pclqInfos,
322-
pcsgTopologyConstraints: pcsgTopologyConstraints,
321+
fqn: apicommon.CreatePodGangNameFromPCSGFQN(pcsgFQN, podGangIndex),
322+
topologyConstraint: topologyConstraint,
323+
pclqs: pclqInfos,
323324
}
324325

325326
return pg, nil

operator/internal/controller/podcliqueset/components/podgang/syncflow_test.go

Lines changed: 52 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -704,14 +704,11 @@ func TestComputeExpectedPodGangsWithTopologyConstraints(t *testing.T) {
704704
},
705705
{
706706
fqn: "test-pcs-0-scaling-group-0",
707-
topologyLevel: &topologyLevelZone,
707+
topologyLevel: &topologyLevelRack,
708708
pclqConstraints: map[string]grovecorev1alpha1.TopologyLevel{
709709
"test-pcs-0-scaling-group-1-decode-leader": topologyLevelHost,
710710
"test-pcs-0-scaling-group-1-decode-worker": topologyLevelHost,
711711
},
712-
pcsgConstraints: map[string]grovecorev1alpha1.TopologyLevel{
713-
"test-pcs-0-scaling-group-1": topologyLevelRack,
714-
},
715712
},
716713
},
717714
},
@@ -770,14 +767,11 @@ func TestComputeExpectedPodGangsWithTopologyConstraints(t *testing.T) {
770767
},
771768
{
772769
fqn: "test-pcs-0-scaling-group-0",
773-
topologyLevel: &topologyLevelZone,
770+
topologyLevel: &topologyLevelRack,
774771
pclqConstraints: map[string]grovecorev1alpha1.TopologyLevel{
775772
"test-pcs-0-scaling-group-1-decode-leader": topologyLevelHost,
776773
"test-pcs-0-scaling-group-1-decode-worker": topologyLevelHost,
777774
},
778-
pcsgConstraints: map[string]grovecorev1alpha1.TopologyLevel{
779-
"test-pcs-0-scaling-group-1": topologyLevelRack,
780-
},
781775
},
782776
},
783777
},
@@ -823,6 +817,56 @@ func TestComputeExpectedPodGangsWithTopologyConstraints(t *testing.T) {
823817
expectedNumPodGangs: 2,
824818
expectedPodGangTopologyConstraints: []expectedPodGangTopologyConstraints{},
825819
},
820+
{
821+
name: "PCS with PCSG where PCSG has nil topology constraints and falls back to PCS level",
822+
tasEnabled: true,
823+
pcsTopologyLevel: &topologyLevelZone,
824+
pclqTemplateSpecs: []*grovecorev1alpha1.PodCliqueTemplateSpec{
825+
{
826+
Name: "decode-leader",
827+
TopologyConstraint: &grovecorev1alpha1.TopologyConstraint{PackDomain: "host"},
828+
Spec: grovecorev1alpha1.PodCliqueSpec{
829+
Replicas: 1,
830+
MinAvailable: ptr.To(int32(1)),
831+
},
832+
},
833+
{
834+
Name: "decode-worker",
835+
TopologyConstraint: &grovecorev1alpha1.TopologyConstraint{PackDomain: "host"},
836+
Spec: grovecorev1alpha1.PodCliqueSpec{
837+
Replicas: 5,
838+
MinAvailable: ptr.To(int32(1)),
839+
},
840+
},
841+
},
842+
pcsgConfigs: []grovecorev1alpha1.PodCliqueScalingGroupConfig{
843+
{
844+
Name: "scaling-group",
845+
Replicas: ptr.To(int32(2)),
846+
MinAvailable: ptr.To(int32(1)),
847+
CliqueNames: []string{"decode-leader", "decode-worker"},
848+
},
849+
},
850+
expectedNumPodGangs: 2,
851+
expectedPodGangTopologyConstraints: []expectedPodGangTopologyConstraints{
852+
{
853+
fqn: "test-pcs-0",
854+
topologyLevel: &topologyLevelZone,
855+
pclqConstraints: map[string]grovecorev1alpha1.TopologyLevel{
856+
"test-pcs-0-scaling-group-0-decode-leader": topologyLevelHost,
857+
"test-pcs-0-scaling-group-0-decode-worker": topologyLevelHost,
858+
},
859+
},
860+
{
861+
fqn: "test-pcs-0-scaling-group-0",
862+
topologyLevel: &topologyLevelZone,
863+
pclqConstraints: map[string]grovecorev1alpha1.TopologyLevel{
864+
"test-pcs-0-scaling-group-1-decode-leader": topologyLevelHost,
865+
"test-pcs-0-scaling-group-1-decode-worker": topologyLevelHost,
866+
},
867+
},
868+
},
869+
},
826870
}
827871

828872
for _, tc := range tests {

0 commit comments

Comments
 (0)