Skip to content

Commit 9640779

Browse files
authored
Remove deadlock when deploying PCS with ComputeDomain (#215)
* Check scheduled pods against minAvailable for SchedulingGate removal for pods belonging to scaled pod gangs. Previously we used to check for ready pods. This is required to remove deadlock when using computing domain (DRA). * Update docstring for PodCliqueScalingGroupConfig.MinAvailable * Regenerated the api docs and crds Signed-off-by: Madhav Bhargava <madhav.bhargava@sap.com> --------- Signed-off-by: Madhav Bhargava <madhav.bhargava@sap.com>
1 parent e8050ac commit 9640779

File tree

5 files changed

+68
-62
lines changed

5 files changed

+68
-62
lines changed

docs/api-reference/operator-api.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,7 @@ _Appears in:_
233233
| `name` _string_ | Name is the name of the PodCliqueScalingGroupConfig. This should be unique within the PodCliqueSet.<br />It allows consumers to give a semantic name to a group of PodCliques that needs to be scaled together. | | |
234234
| `cliqueNames` _string array_ | CliqueNames is the list of names of the PodClique's that are part of the scaling group. | | |
235235
| `replicas` _integer_ | Replicas is the desired number of replicas for the scaling group at template level.<br />This allows one to control the replicas of the scaling group at startup.<br />If not specified, it defaults to 1. | 1 | |
236-
| `minAvailable` _integer_ | MinAvailable specifies the minimum number of ready replicas required for a PodCliqueScalingGroup to be considered operational.<br />A PodCliqueScalingGroup replica is considered "ready" when its associated PodCliques have sufficient ready or starting pods.<br />If MinAvailable is breached, it will be used to signal that the PodCliqueScalingGroup is no longer operating with the desired availability.<br />MinAvailable cannot be greater than Replicas. If ScaleConfig is defined then its MinAvailable should not be less than ScaleConfig.MinReplicas.<br />If not specified, it defaults to 1. | 1 | |
236+
| `minAvailable` _integer_ | MinAvailable serves two purposes:<br />Gang Scheduling:<br />It defines the minimum number of replicas that are guaranteed to be gang scheduled.<br />Gang Termination:<br />It defines the minimum requirement of available replicas for a PodCliqueScalingGroup.<br />Violation of this threshold for a duration beyond TerminationDelay will result in termination of the PodCliqueSet replica that it belongs to.<br />Default: If not specified, it defaults to 1.<br />Constraints:<br />MinAvailable cannot be greater than Replicas.<br />If ScaleConfig is defined then its MinAvailable should not be less than ScaleConfig.MinReplicas. | 1 | |
237237
| `scaleConfig` _[AutoScalingConfig](#autoscalingconfig)_ | ScaleConfig is the horizontal pod autoscaler configuration for the pod clique scaling group. | | |
238238

239239

operator/api/core/v1alpha1/crds/grove.io_podcliquesets.yaml

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9071,11 +9071,16 @@ spec:
90719071
minAvailable:
90729072
default: 1
90739073
description: |-
9074-
MinAvailable specifies the minimum number of ready replicas required for a PodCliqueScalingGroup to be considered operational.
9075-
A PodCliqueScalingGroup replica is considered "ready" when its associated PodCliques have sufficient ready or starting pods.
9076-
If MinAvailable is breached, it will be used to signal that the PodCliqueScalingGroup is no longer operating with the desired availability.
9077-
MinAvailable cannot be greater than Replicas. If ScaleConfig is defined then its MinAvailable should not be less than ScaleConfig.MinReplicas.
9078-
If not specified, it defaults to 1.
9074+
MinAvailable serves two purposes:
9075+
Gang Scheduling:
9076+
It defines the minimum number of replicas that are guaranteed to be gang scheduled.
9077+
Gang Termination:
9078+
It defines the minimum requirement of available replicas for a PodCliqueScalingGroup.
9079+
Violation of this threshold for a duration beyond TerminationDelay will result in termination of the PodCliqueSet replica that it belongs to.
9080+
Default: If not specified, it defaults to 1.
9081+
Constraints:
9082+
MinAvailable cannot be greater than Replicas.
9083+
If ScaleConfig is defined then its MinAvailable should not be less than ScaleConfig.MinReplicas.
90799084
format: int32
90809085
type: integer
90819086
name:

operator/api/core/v1alpha1/podcliqueset.go

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -216,11 +216,16 @@ type PodCliqueScalingGroupConfig struct {
216216
// +optional
217217
// +kubebuilder:default=1
218218
Replicas *int32 `json:"replicas,omitempty"`
219-
// MinAvailable specifies the minimum number of ready replicas required for a PodCliqueScalingGroup to be considered operational.
220-
// A PodCliqueScalingGroup replica is considered "ready" when its associated PodCliques have sufficient ready or starting pods.
221-
// If MinAvailable is breached, it will be used to signal that the PodCliqueScalingGroup is no longer operating with the desired availability.
222-
// MinAvailable cannot be greater than Replicas. If ScaleConfig is defined then its MinAvailable should not be less than ScaleConfig.MinReplicas.
223-
// If not specified, it defaults to 1.
219+
// MinAvailable serves two purposes:
220+
// Gang Scheduling:
221+
// It defines the minimum number of replicas that are guaranteed to be gang scheduled.
222+
// Gang Termination:
223+
// It defines the minimum requirement of available replicas for a PodCliqueScalingGroup.
224+
// Violation of this threshold for a duration beyond TerminationDelay will result in termination of the PodCliqueSet replica that it belongs to.
225+
// Default: If not specified, it defaults to 1.
226+
// Constraints:
227+
// MinAvailable cannot be greater than Replicas.
228+
// If ScaleConfig is defined then its MinAvailable should not be less than ScaleConfig.MinReplicas.
224229
// +optional
225230
// +kubebuilder:default=1
226231
MinAvailable *int32 `json:"minAvailable,omitempty"`

operator/internal/controller/podclique/components/pod/syncflow.go

Lines changed: 18 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -240,15 +240,15 @@ func (r _resource) checkAndRemovePodSchedulingGates(sc *syncContext, logger logr
240240
tasks := make([]utils.Task, 0, len(sc.existingPCLQPods))
241241
skippedScheduleGatedPods := make([]string, 0, len(sc.existingPCLQPods))
242242

243-
// Pre-compute base PodGang readiness once for all pods in this PodClique
243+
// Pre-compute if the base PodGang is scheduled once for all pods in this PodClique
244244
// All pods in the same PodClique have the same base PodGang
245-
basePodGangReady, basePodGangName, err := r.checkBasePodGangReadinessForPodClique(sc.ctx, logger, sc.pclq)
245+
basePodGangScheduled, basePodGangName, err := r.checkBasePodGangScheduledForPodClique(sc.ctx, logger, sc.pclq)
246246
if err != nil {
247-
logger.Error(err, "Error checking base PodGang readiness for PodClique - will requeue")
247+
logger.Error(err, "Error checking if base PodGang is scheduled for PodClique - will requeue")
248248
return nil, groveerr.WrapError(err,
249249
errCodeRemovePodSchedulingGate,
250250
component.OperationSync,
251-
"failed to check base PodGang readiness for PodClique",
251+
"failed to check if base PodGang is scheduled for PodClique",
252252
)
253253
}
254254

@@ -260,7 +260,7 @@ func (r _resource) checkAndRemovePodSchedulingGates(sc *syncContext, logger logr
260260
skippedScheduleGatedPods = append(skippedScheduleGatedPods, p.Name)
261261
continue
262262
}
263-
shouldSkip := r.shouldSkipPodSchedulingGateRemoval(logger, p, basePodGangReady, basePodGangName)
263+
shouldSkip := r.shouldSkipPodSchedulingGateRemoval(logger, p, basePodGangScheduled, basePodGangName)
264264
if shouldSkip {
265265
skippedScheduleGatedPods = append(skippedScheduleGatedPods, p.Name)
266266
continue
@@ -297,10 +297,10 @@ func (r _resource) checkAndRemovePodSchedulingGates(sc *syncContext, logger logr
297297
return skippedScheduleGatedPods, nil
298298
}
299299

300-
// isBasePodGangReady checks if the base PodGang (identified by name) is ready, returning errors for API failures.
301-
// A base PodGang is considered "ready" when ALL of its constituent PodCliques have achieved
302-
// their minimum required number of ready pods (PodClique.Status.ReadyReplicas >= PodGroup.MinReplicas).
303-
func (r _resource) isBasePodGangReady(ctx context.Context, logger logr.Logger, namespace, basePodGangName string) (bool, error) {
300+
// isBasePodGangScheduled checks if the base PodGang (identified by name) is scheduled, returning errors for API failures.
301+
// A base PodGang is considered "scheduled" when ALL of its constituent PodCliques have achieved
302+
// their minimum required number of scheduled pods (PodClique.Status.ScheduledReplicas >= PodGroup.MinReplicas).
303+
func (r _resource) isBasePodGangScheduled(ctx context.Context, logger logr.Logger, namespace, basePodGangName string) (bool, error) {
304304
// Get the base PodGang - treat all errors (including NotFound) as requeue-able
305305
basePodGang, err := componentutils.GetPodGang(ctx, r.client, basePodGangName, namespace)
306306
if err != nil {
@@ -315,11 +315,9 @@ func (r _resource) isBasePodGangReady(ctx context.Context, logger logr.Logger, n
315315
// Each PodGroup represents a PodClique within the base PodGang and must meet its MinReplicas requirement
316316
for _, podGroup := range basePodGang.Spec.PodGroups {
317317
pclqName := podGroup.Name
318-
319-
// Get the PodClique
320318
pclq := &grovecorev1alpha1.PodClique{}
321319
pclqKey := client.ObjectKey{Name: pclqName, Namespace: namespace}
322-
if err := r.client.Get(ctx, pclqKey, pclq); err != nil {
320+
if err = r.client.Get(ctx, pclqKey, pclq); err != nil {
323321
// All errors (including NotFound) should trigger requeue for reliable retry
324322
// This ensures PodClique exists before we evaluate base PodGang readiness
325323
return false, groveerr.WrapError(err,
@@ -329,13 +327,11 @@ func (r _resource) isBasePodGangReady(ctx context.Context, logger logr.Logger, n
329327
)
330328
}
331329

332-
// CRITICAL READINESS CHECK: Compare actual ready pods vs required minimum
333-
// If ANY PodClique in the base PodGang fails this check, the entire base is considered not ready
334-
if pclq.Status.ReadyReplicas < podGroup.MinReplicas {
335-
logger.Info("Base PodGang not ready: PodClique has insufficient ready replicas",
330+
if pclq.Status.ScheduledReplicas < podGroup.MinReplicas {
331+
logger.Info("Base PodGang not scheduled: PodClique has insufficient scheduled replicas",
336332
"basePodGangName", basePodGangName,
337333
"pclqName", pclqName,
338-
"readyReplicas", pclq.Status.ReadyReplicas,
334+
"scheduledReplicas", pclq.Status.ScheduledReplicas,
339335
"minReplicas", podGroup.MinReplicas)
340336
return false, nil // Not ready, but no error - legitimate state
341337
}
@@ -345,22 +341,22 @@ func (r _resource) isBasePodGangReady(ctx context.Context, logger logr.Logger, n
345341
return true, nil
346342
}
347343

348-
// checkBasePodGangReadinessForPodClique determines if there's a base PodGang that needs to be checked
349-
// for readiness, and if so, performs that check once for the entire PodClique.
350-
func (r _resource) checkBasePodGangReadinessForPodClique(ctx context.Context, logger logr.Logger, pclq *grovecorev1alpha1.PodClique) (bool, string, error) {
344+
// checkBasePodGangScheduledForPodClique determines if there's a base PodGang for the PodClique. If there is one,
345+
// this function checks if it is scheduled.
346+
func (r _resource) checkBasePodGangScheduledForPodClique(ctx context.Context, logger logr.Logger, pclq *grovecorev1alpha1.PodClique) (bool, string, error) {
351347
// Check if this PodClique has a base PodGang dependency
352348
basePodGangName, hasBasePodGangLabel := pclq.GetLabels()[common.LabelBasePodGang]
353349
if !hasBasePodGangLabel {
354350
// This PodClique is a base PodGang itself - no dependency
355351
return true, "", nil
356352
}
357353

358-
ready, err := r.isBasePodGangReady(ctx, logger, pclq.Namespace, basePodGangName)
354+
scheduled, err := r.isBasePodGangScheduled(ctx, logger, pclq.Namespace, basePodGangName)
359355
if err != nil {
360356
return false, basePodGangName, err
361357
}
362358

363-
return ready, basePodGangName, nil
359+
return scheduled, basePodGangName, nil
364360
}
365361

366362
// shouldSkipPodSchedulingGateRemoval implements the core PodGang scheduling gate logic.

operator/internal/controller/podclique/components/pod/syncflow_test.go

Lines changed: 29 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ func TestCheckAndRemovePodSchedulingGates_MinAvailableAware(t *testing.T) {
194194
}
195195

196196
// For scaled PodGang tests, add the base PodGang label to the PodClique
197-
// This is what the production code expects to read in checkBasePodGangReadinessForPodClique
197+
// This is what the production code expects to read in checkBasePodGangScheduledForPodClique
198198
if isScaledPodGangTest {
199199
testPclq.Labels = map[string]string{
200200
common.LabelBasePodGang: "simple1-0",
@@ -320,54 +320,54 @@ func TestCheckAndRemovePodSchedulingGates_ConcurrentExecution(t *testing.T) {
320320
}
321321
}
322322

323-
func TestIsBasePodGangReady(t *testing.T) {
323+
func TestIsBasePodGangScheduled(t *testing.T) {
324324
tests := []struct {
325325
name string
326326
basePodGangExists bool
327327
podCliques []testPodClique
328-
expectedReady bool
328+
expectedScheduled bool
329329
expectError bool
330330
description string
331331
}{
332332
{
333-
name: "Base PodGang ready - all PodCliques meet MinAvailable",
333+
name: "Base PodGang is scheduled - all PodCliques meet MinAvailable",
334334
basePodGangExists: true,
335335
podCliques: []testPodClique{
336-
{name: "simple1-0-pcb", minAvailable: 2, readyReplicas: 2},
337-
{name: "simple1-0-pcc", minAvailable: 1, readyReplicas: 3},
336+
{name: "simple1-0-pcb", minAvailable: 2, scheduledReplicas: 2},
337+
{name: "simple1-0-pcc", minAvailable: 1, scheduledReplicas: 3},
338338
},
339-
expectedReady: true,
340-
expectError: false,
341-
description: "All PodCliques meet their MinAvailable requirements",
339+
expectedScheduled: true,
340+
expectError: false,
341+
description: "All PodCliques meet their MinAvailable requirements",
342342
},
343343
{
344-
name: "Base PodGang not ready - one PodClique below MinAvailable",
344+
name: "Base PodGang not scheduled - one PodClique below MinAvailable",
345345
basePodGangExists: true,
346346
podCliques: []testPodClique{
347-
{name: "simple1-0-pcb", minAvailable: 2, readyReplicas: 2},
348-
{name: "simple1-0-pcc", minAvailable: 3, readyReplicas: 2}, // Below MinAvailable
347+
{name: "simple1-0-pcb", minAvailable: 2, scheduledReplicas: 2},
348+
{name: "simple1-0-pcc", minAvailable: 3, scheduledReplicas: 2}, // Below MinAvailable
349349
},
350-
expectedReady: false,
351-
expectError: false,
352-
description: "One PodClique below MinAvailable makes base PodGang not ready",
350+
expectedScheduled: false,
351+
expectError: false,
352+
description: "One PodClique below MinAvailable makes base PodGang not scheduled",
353353
},
354354
{
355355
name: "Base PodGang missing",
356356
basePodGangExists: false,
357357
podCliques: []testPodClique{},
358-
expectedReady: false,
358+
expectedScheduled: false,
359359
expectError: true,
360360
description: "Missing base PodGang should return error for requeue",
361361
},
362362
{
363363
name: "Base PodGang ready - single PodClique",
364364
basePodGangExists: true,
365365
podCliques: []testPodClique{
366-
{name: "simple1-0-pcb", minAvailable: 1, readyReplicas: 1},
366+
{name: "simple1-0-pcb", minAvailable: 1, scheduledReplicas: 1},
367367
},
368-
expectedReady: true,
369-
expectError: false,
370-
description: "Single PodClique meeting MinAvailable",
368+
expectedScheduled: true,
369+
expectError: false,
370+
description: "Single PodClique meeting MinAvailable",
371371
},
372372
}
373373

@@ -402,7 +402,7 @@ func TestIsBasePodGangReady(t *testing.T) {
402402

403403
// Add corresponding PodCliques
404404
for _, pclq := range tt.podCliques {
405-
podClique := createTestPodClique(pclq.name, pclq.minAvailable, pclq.readyReplicas)
405+
podClique := createTestPodClique(pclq.name, pclq.minAvailable, pclq.scheduledReplicas)
406406
objects = append(objects, podClique)
407407
}
408408
}
@@ -415,15 +415,15 @@ func TestIsBasePodGangReady(t *testing.T) {
415415

416416
// Test the readiness check
417417
r := &_resource{client: fakeClient}
418-
result, err := r.isBasePodGangReady(context.Background(), logr.Discard(), "default", "simple1-0")
418+
result, err := r.isBasePodGangScheduled(context.Background(), logr.Discard(), "default", "simple1-0")
419419

420420
if tt.expectError {
421421
require.Error(t, err, "Expected error for test case: %s", tt.name)
422-
// When error occurs, result should be false and we don't check expectedReady
422+
// When error occurs, result should be false and we don't check expectedScheduled
423423
assert.False(t, result, "Result should be false when error occurs")
424424
} else {
425425
require.NoError(t, err, "Unexpected error for test case: %s", tt.name)
426-
assert.Equal(t, tt.expectedReady, result, tt.description)
426+
assert.Equal(t, tt.expectedScheduled, result, tt.description)
427427
}
428428
})
429429
}
@@ -432,9 +432,9 @@ func TestIsBasePodGangReady(t *testing.T) {
432432
// Test helper types and functions
433433

434434
type testPodClique struct {
435-
name string
436-
minAvailable int32
437-
readyReplicas int32
435+
name string
436+
minAvailable int32
437+
scheduledReplicas int32
438438
}
439439

440440
func createTestPod(podGangName string, hasGate bool, inPodGang bool) *corev1.Pod {
@@ -548,7 +548,7 @@ func createTestBasePodGangWithPodCliques(podCliques []testPodClique) *grovesched
548548
}
549549
}
550550

551-
func createTestPodClique(name string, minAvailable, readyReplicas int32) *grovecorev1alpha1.PodClique {
551+
func createTestPodClique(name string, minAvailable, scheduledReplicas int32) *grovecorev1alpha1.PodClique {
552552
return &grovecorev1alpha1.PodClique{
553553
ObjectMeta: metav1.ObjectMeta{
554554
Name: name,
@@ -558,7 +558,7 @@ func createTestPodClique(name string, minAvailable, readyReplicas int32) *grovec
558558
MinAvailable: ptr.To(minAvailable),
559559
},
560560
Status: grovecorev1alpha1.PodCliqueStatus{
561-
ReadyReplicas: readyReplicas,
561+
ScheduledReplicas: scheduledReplicas,
562562
},
563563
}
564564
}

0 commit comments

Comments
 (0)