86 changes: 59 additions & 27 deletions operator/e2e/tests/rolling_updates_test.go
@@ -19,8 +19,13 @@
package tests

import (
"context"
"fmt"
"testing"
"time"

"github.com/ai-dynamo/grove/operator/e2e/utils"
"k8s.io/apimachinery/pkg/watch"
)

// Test_RU7_RollingUpdatePCSPodClique tests rolling update when PCS-owned Podclique spec is updated
@@ -100,7 +105,6 @@ func Test_RU8_RollingUpdatePCSGPodClique(t *testing.T) {
logger.Info("🎉 Rolling Update on PCSG-owned Podclique test (RU-8) completed successfully!")
}

/* This test is flaky. It sometimes fails with "Failed to wait for rolling update to complete: condition not met..."
// Test_RU9_RollingUpdateAllPodCliques tests rolling update when all Podclique specs are updated
// Scenario RU-9:
// 1. Initialize a 10-node Grove cluster
@@ -140,17 +144,18 @@ func Test_RU9_RollingUpdateAllPodCliques(t *testing.T) {

logger.Info("🎉 Rolling Update on all Podcliques test (RU-9) completed successfully!")
}
*/

/* This test fails. The rolling update starts, a pod gets deleted.
// Test_RU10_RollingUpdateInsufficientResources tests rolling update with insufficient resources
// Test_RU10_RollingUpdateInsufficientResources tests rolling update behavior with insufficient resources.
// This test verifies the "delete-first" strategy: when nodes are cordoned, the operator deletes
// one old pod and creates a new one (which remains Pending). No further progress is made until
// nodes are uncordoned, at which point the rolling update completes.
// Scenario RU-10:
// 1. Initialize a 10-node Grove cluster
// 2. Deploy workload WL1, and verify 10 newly created pods
// 3. Cordon all worker nodes
// 4. Change the specification of pc-a
// 5. Verify the rolling update does not progress due to insufficient resources
// 6. Uncordon the nodes, and verify the rolling update continues
// 5. Verify exactly one pod is deleted and a new Pending pod is created (delete-first strategy)
// 6. Uncordon the nodes, and verify the rolling update completes
func Test_RU10_RollingUpdateInsufficientResources(t *testing.T) {
ctx := context.Background()

@@ -213,43 +218,70 @@ func Test_RU10_RollingUpdateInsufficientResources(t *testing.T) {
defer tracker.Stop()

logger.Info("4. Change the specification of pc-a")
err = triggerPodCliqueRollingUpdate(ctx, dynamicClient, tc.Namespace, "workload1", "pc-a")
if err != nil {
if err := triggerPodCliqueRollingUpdate(tc, "pc-a"); err != nil {
t.Fatalf("Failed to update PodClique spec: %v", err)
}

logger.Info("5. Verify the rolling update does not progress due to insufficient resources")
time.Sleep(1 * time.Minute)

// Verify that none of the existing pods were deleted during the insufficient resources period
events := tracker.getEvents()
var deletedExistingPods []string
for _, event := range events {
switch event.Type {
case watch.Deleted:
if existingPodNames[event.Pod.Name] {
deletedExistingPods = append(deletedExistingPods, event.Pod.Name)
logger.Debugf("Existing pod deleted during insufficient resources: %s", event.Pod.Name)
logger.Info("5. Verify exactly one pod is deleted and a new Pending pod is created (delete-first strategy)")

// Poll until we see exactly 1 pod deleted and 1 new pod created, verifying delete-first behavior
pollErr := utils.PollForCondition(ctx, 2*time.Minute, 2*time.Second, func() (bool, error) {
events := tracker.getEvents()
var deletedExistingPods []string
var addedPods []string

for _, event := range events {
switch event.Type {
case watch.Deleted:
if existingPodNames[event.Pod.Name] {
deletedExistingPods = append(deletedExistingPods, event.Pod.Name)
logger.Debugf("Existing pod deleted during rolling update: %s", event.Pod.Name)
}
case watch.Added:
if !existingPodNames[event.Pod.Name] {
addedPods = append(addedPods, event.Pod.Name)
logger.Debugf("New pod created during rolling update: %s", event.Pod.Name)
}
}
}
}

if len(deletedExistingPods) > 0 {
t.Fatalf("Rolling update progressed despite insufficient resources: %d existing pods deleted: %v",
len(deletedExistingPods), deletedExistingPods)
// Check if we've reached the expected delete-first state
deletedCount := len(deletedExistingPods)
addedCount := len(addedPods)

// If more than 1 pod was deleted, the test should fail (not delete-first)
if deletedCount > 1 {
return false, fmt.Errorf("rolling update progressed beyond first deletion - expected 1 pod deleted but got %d: %v (not delete-first strategy)", deletedCount, deletedExistingPods)
}

// Success: exactly 1 deleted and 1 new pod created
if deletedCount == 1 && addedCount == 1 {
logger.Infof("✅ Delete-first strategy verified: 1 pod deleted (%v), 1 new pod created (%v)", deletedExistingPods, addedPods)
return true, nil
}

// Still waiting for the condition
logger.Debugf("Waiting for delete-first state: deleted=%d (want 1), added=%d (want 1)", deletedCount, addedCount)
return false, nil
})

if pollErr != nil {
t.Fatalf("Failed to verify delete-first strategy: %v", pollErr)
}

logger.Info("6. Uncordon the nodes, and verify the rolling update continues")
logger.Info("6. Uncordon the nodes, and verify the rolling update completes")
uncordonNodes(tc, workerNodes)

// Wait for rolling update to complete after uncordoning
if err := waitForRollingUpdateComplete(ctx, dynamicClient, tc.Namespace, "workload1", 1, 1*time.Minute); err != nil {
tcLongTimeout := tc
tcLongTimeout.Timeout = 5 * time.Minute
if err := waitForRollingUpdateComplete(tcLongTimeout, 1); err != nil {
logger.Info("=== Rolling update timed out - capturing debug info ===")
t.Fatalf("Failed to wait for rolling update to complete: %v", err)
}

logger.Info("🎉 Rolling Update with insufficient resources test (RU-10) completed successfully!")
}
*/

// Test_RU11_RollingUpdateWithPCSScaleOut tests rolling update with scale-out on PCS
// Scenario RU-11:
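For context on the polling pattern used in step 5 of RU-10 above: the test passes a condition function to utils.PollForCondition and relies on a non-nil error from that function to abort the poll early. The sketch below is a minimal, hypothetical stand-in for such a helper, with its signature inferred from the call site (ctx, timeout, interval, condition returning (bool, error)); the real helper in operator/e2e/utils may differ. It only reuses the "context", "fmt", and "time" imports already shown in the diff.

// pollForCondition is a hypothetical sketch of a PollForCondition-style helper:
// it re-evaluates check every interval until it returns true, returns a non-nil
// error, or the timeout elapses. Not the actual implementation in operator/e2e/utils.
func pollForCondition(ctx context.Context, timeout, interval time.Duration, check func() (bool, error)) error {
	ctx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		done, err := check()
		if err != nil {
			// A hard error from the condition (e.g. more than one pod deleted) aborts the poll.
			return err
		}
		if done {
			return nil
		}
		select {
		case <-ctx.Done():
			return fmt.Errorf("condition not met within %s", timeout)
		case <-ticker.C:
		}
	}
}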
9 changes: 5 additions & 4 deletions operator/internal/controller/podclique/reconcilestatus.go
@@ -124,12 +124,13 @@ func mutateReplicas(pclq *grovecorev1alpha1.PodClique, podCategories map[corev1.
// mutateUpdatedReplica calculates and sets the number of pods with the expected template hash
func mutateUpdatedReplica(pclq *grovecorev1alpha1.PodClique, existingPods []*corev1.Pod) {
var expectedPodTemplateHash string
// If the PCLQ update is in progress then take the expected PodTemplateHash from the PodClique.Status.RollingUpdateProgress.PodCliqueSetGenerationHash field
// else take it from the PodClique.Status.CurrentPodTemplateHash field
if componentutils.IsPCLQUpdateInProgress(pclq) {
// If RollingUpdateProgress exists (update in progress or recently completed), use the target hash from it.
// This covers both the active update phase and the window after completion before CurrentPodTemplateHash is synced.
if pclq.Status.RollingUpdateProgress != nil {
expectedPodTemplateHash = pclq.Status.RollingUpdateProgress.PodTemplateHash
} else if pclq.Status.CurrentPodTemplateHash != nil {
// CurrentPodTemplateHash will be set if RollingUpdateProgress is nil or if the last update has completed.
// Steady state: no rolling update tracking exists.
// Use the stable current hash for pods that have been reconciled.
expectedPodTemplateHash = *pclq.Status.CurrentPodTemplateHash
}
// If expectedPodTemplateHash is empty, it means that the PCLQ has never been successfully reconciled and therefore no pods should be considered as updated.
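To make the counting step concrete: once the expected hash has been chosen as described in the comments above, mutateUpdatedReplica effectively counts the pods whose template-hash label matches it. The sketch below is illustrative only (countUpdatedPods is a hypothetical name, not the operator's code); it assumes the corev1 and apicommon imports shown in the new unit test below, and uses the same apicommon.LabelPodTemplateHash label key that test relies on.

// countUpdatedPods sketches the counting step: a pod is considered updated
// when its template-hash label equals the expected hash. Illustrative only.
func countUpdatedPods(expectedPodTemplateHash string, existingPods []*corev1.Pod) int32 {
	if expectedPodTemplateHash == "" {
		// The PCLQ has never been successfully reconciled, so no pods count as updated.
		return 0
	}
	var updated int32
	for _, pod := range existingPods {
		if pod.Labels[apicommon.LabelPodTemplateHash] == expectedPodTemplateHash {
			updated++
		}
	}
	return updated
}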
195 changes: 195 additions & 0 deletions operator/internal/controller/podclique/reconcilestatus_test.go
@@ -0,0 +1,195 @@
// /*
// Copyright 2025 The Grove Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// */

package podclique

import (
"testing"
"time"

apicommon "github.com/ai-dynamo/grove/operator/api/common"
grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"

"github.com/stretchr/testify/assert"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"
)

// TestMutateUpdatedReplica tests the mutateUpdatedReplica function across different PodClique states
func TestMutateUpdatedReplica(t *testing.T) {
tests := []struct {
name string
pclq *grovecorev1alpha1.PodClique
existingPods []*corev1.Pod
expectedUpdatedReplicas int32
}{
{
name: "rolling update in progress - count pods with new hash",
pclq: &grovecorev1alpha1.PodClique{
Status: grovecorev1alpha1.PodCliqueStatus{
RollingUpdateProgress: &grovecorev1alpha1.PodCliqueRollingUpdateProgress{
PodTemplateHash: "new-hash-v2",
UpdateStartedAt: metav1.Time{Time: time.Now()},
UpdateEndedAt: nil, // Still in progress
},
CurrentPodTemplateHash: ptr.To("old-hash-v1"),
},
},
existingPods: []*corev1.Pod{
// 3 pods updated to new version
createPodWithHash("pod-1", "new-hash-v2"),
createPodWithHash("pod-2", "new-hash-v2"),
createPodWithHash("pod-3", "new-hash-v2"),
// 7 pods still on old version
createPodWithHash("pod-4", "old-hash-v1"),
createPodWithHash("pod-5", "old-hash-v1"),
createPodWithHash("pod-6", "old-hash-v1"),
createPodWithHash("pod-7", "old-hash-v1"),
createPodWithHash("pod-8", "old-hash-v1"),
createPodWithHash("pod-9", "old-hash-v1"),
createPodWithHash("pod-10", "old-hash-v1"),
},
expectedUpdatedReplicas: 3, // Only the 3 pods with new hash
},
{
name: "update just completed - use RollingUpdateProgress hash (edge case)",
pclq: &grovecorev1alpha1.PodClique{
Status: grovecorev1alpha1.PodCliqueStatus{
RollingUpdateProgress: &grovecorev1alpha1.PodCliqueRollingUpdateProgress{
PodTemplateHash: "new-hash-v2",
UpdateStartedAt: metav1.Time{Time: time.Now().Add(-5 * time.Minute)},
UpdateEndedAt: &metav1.Time{Time: time.Now()}, // Just completed
},
// CurrentPodTemplateHash not updated yet - still has old hash
CurrentPodTemplateHash: ptr.To("old-hash-v1"),
},
},
existingPods: []*corev1.Pod{
// All pods updated to new version
createPodWithHash("pod-1", "new-hash-v2"),
createPodWithHash("pod-2", "new-hash-v2"),
createPodWithHash("pod-3", "new-hash-v2"),
createPodWithHash("pod-4", "new-hash-v2"),
createPodWithHash("pod-5", "new-hash-v2"),
createPodWithHash("pod-6", "new-hash-v2"),
createPodWithHash("pod-7", "new-hash-v2"),
createPodWithHash("pod-8", "new-hash-v2"),
createPodWithHash("pod-9", "new-hash-v2"),
createPodWithHash("pod-10", "new-hash-v2"),
},
expectedUpdatedReplicas: 10, // All 10 pods should be counted as updated
},
{
name: "steady state - no rolling update",
pclq: &grovecorev1alpha1.PodClique{
Status: grovecorev1alpha1.PodCliqueStatus{
RollingUpdateProgress: nil, // No rolling update
CurrentPodTemplateHash: ptr.To("stable-hash"),
},
},
existingPods: []*corev1.Pod{
createPodWithHash("pod-1", "stable-hash"),
createPodWithHash("pod-2", "stable-hash"),
createPodWithHash("pod-3", "stable-hash"),
createPodWithHash("pod-4", "stable-hash"),
createPodWithHash("pod-5", "stable-hash"),
},
expectedUpdatedReplicas: 5, // All pods match current hash
},
{
name: "never reconciled - empty hash",
pclq: &grovecorev1alpha1.PodClique{
Status: grovecorev1alpha1.PodCliqueStatus{
RollingUpdateProgress: nil,
CurrentPodTemplateHash: nil, // Never set
},
},
existingPods: []*corev1.Pod{
createPodWithHash("pod-1", "some-hash"),
createPodWithHash("pod-2", "some-hash"),
},
expectedUpdatedReplicas: 0, // Should not count any pods when hash is unknown - no rolling update
},
{
name: "mixed state - some pods match, some don't",
pclq: &grovecorev1alpha1.PodClique{
Status: grovecorev1alpha1.PodCliqueStatus{
RollingUpdateProgress: nil,
CurrentPodTemplateHash: ptr.To("current-hash"),
},
},
existingPods: []*corev1.Pod{
createPodWithHash("pod-1", "current-hash"),
createPodWithHash("pod-2", "current-hash"),
createPodWithHash("pod-3", "old-hash"),
createPodWithHash("pod-4", "old-hash"),
createPodWithHash("pod-5", "old-hash"),
},
expectedUpdatedReplicas: 2, // Only pods with current hash
},
{
name: "no pods exist",
pclq: &grovecorev1alpha1.PodClique{
Status: grovecorev1alpha1.PodCliqueStatus{
CurrentPodTemplateHash: ptr.To("some-hash"),
},
},
existingPods: []*corev1.Pod{},
expectedUpdatedReplicas: 0,
},
{
name: "rolling update progress exists but pods have different hash",
pclq: &grovecorev1alpha1.PodClique{
Status: grovecorev1alpha1.PodCliqueStatus{
RollingUpdateProgress: &grovecorev1alpha1.PodCliqueRollingUpdateProgress{
PodTemplateHash: "target-hash",
UpdateStartedAt: metav1.Time{Time: time.Now()},
},
CurrentPodTemplateHash: ptr.To("old-hash"),
},
},
existingPods: []*corev1.Pod{
createPodWithHash("pod-1", "completely-different-hash"),
createPodWithHash("pod-2", "another-hash"),
},
expectedUpdatedReplicas: 0, // None match the target hash
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Call the function
mutateUpdatedReplica(tt.pclq, tt.existingPods)

// Assert the result
assert.Equal(t, tt.expectedUpdatedReplicas, tt.pclq.Status.UpdatedReplicas,
"UpdatedReplicas should match expected value")
})
}
}

// createPodWithHash creates a test pod with the specified template hash label
func createPodWithHash(name string, templateHash string) *corev1.Pod {
return &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Labels: map[string]string{
apicommon.LabelPodTemplateHash: templateHash,
},
},
}
}
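Assuming the Go module root sits under operator/ (as the github.com/ai-dynamo/grove/operator import paths suggest), the new table-driven cases can be run in isolation from that directory with a standard invocation, for example:

go test ./internal/controller/podclique/... -run TestMutateUpdatedReplica -v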