Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions pkg/controllers/applicationfailover/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,22 +106,24 @@ func (m *workloadUnhealthyMap) deleteIrrelevantClusters(key types.NamespacedName
m.workloadUnhealthy[key] = unhealthyClusters
}

// distinguishUnhealthyClustersWithOthers distinguishes clusters which is in the unHealthy state(not in the process of eviction) with others.
// distinguishUnhealthyClustersWithOthers distinguishes unhealthy clusters (not in the process of eviction) from healthy ones.
// ResourceUnknown clusters are intentionally excluded from both lists to preserve their failover timers across transient
// status collection gaps.
func distinguishUnhealthyClustersWithOthers(aggregatedStatusItems []workv1alpha2.AggregatedStatusItem, resourceBindingSpec workv1alpha2.ResourceBindingSpec) ([]string, []string) {
var unhealthyClusters, others []string
var unhealthyClusters, healthyClusters []string
for _, aggregatedStatusItem := range aggregatedStatusItems {
cluster := aggregatedStatusItem.ClusterName

if aggregatedStatusItem.Health == workv1alpha2.ResourceUnhealthy && !resourceBindingSpec.ClusterInGracefulEvictionTasks(cluster) {
unhealthyClusters = append(unhealthyClusters, cluster)
}

if aggregatedStatusItem.Health == workv1alpha2.ResourceHealthy || aggregatedStatusItem.Health == workv1alpha2.ResourceUnknown {
others = append(others, cluster)
if aggregatedStatusItem.Health == workv1alpha2.ResourceHealthy {
healthyClusters = append(healthyClusters, cluster)
}
}

return unhealthyClusters, others
return unhealthyClusters, healthyClusters
}

func getClusterNamesFromTargetClusters(targetClusters []workv1alpha2.TargetCluster) []string {
Expand Down
61 changes: 43 additions & 18 deletions pkg/controllers/applicationfailover/common_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,11 +99,11 @@ func TestWorkloadUnhealthyMap_deleteIrrelevantClusters(t *testing.T) {

func TestDistinguishUnhealthyClustersWithOthers(t *testing.T) {
tests := []struct {
name string
aggregatedStatusItems []workv1alpha2.AggregatedStatusItem
resourceBindingSpec workv1alpha2.ResourceBindingSpec
expectedClusters []string
expectedOthers []string
name string
aggregatedStatusItems []workv1alpha2.AggregatedStatusItem
resourceBindingSpec workv1alpha2.ResourceBindingSpec
expectedClusters []string
expectedHealthyClusters []string
}{
{
name: "all applications are healthy",
Expand All @@ -117,9 +117,9 @@ func TestDistinguishUnhealthyClustersWithOthers(t *testing.T) {
Health: workv1alpha2.ResourceHealthy,
},
},
resourceBindingSpec: workv1alpha2.ResourceBindingSpec{},
expectedClusters: nil,
expectedOthers: []string{"member1", "member2"},
resourceBindingSpec: workv1alpha2.ResourceBindingSpec{},
expectedClusters: nil,
expectedHealthyClusters: []string{"member1", "member2"},
},
{
name: "all applications are unknown",
Expand All @@ -133,9 +133,9 @@ func TestDistinguishUnhealthyClustersWithOthers(t *testing.T) {
Health: workv1alpha2.ResourceUnknown,
},
},
resourceBindingSpec: workv1alpha2.ResourceBindingSpec{},
expectedClusters: nil,
expectedOthers: []string{"member1", "member2"},
resourceBindingSpec: workv1alpha2.ResourceBindingSpec{},
expectedClusters: nil,
expectedHealthyClusters: nil,
},
{
name: "one application is unhealthy and not in gracefulEvictionTasks",
Expand All @@ -149,9 +149,9 @@ func TestDistinguishUnhealthyClustersWithOthers(t *testing.T) {
Health: workv1alpha2.ResourceUnhealthy,
},
},
resourceBindingSpec: workv1alpha2.ResourceBindingSpec{},
expectedClusters: []string{"member2"},
expectedOthers: []string{"member1"},
resourceBindingSpec: workv1alpha2.ResourceBindingSpec{},
expectedClusters: []string{"member2"},
expectedHealthyClusters: []string{"member1"},
},
{
name: "one application is unhealthy and in gracefulEvictionTasks",
Expand All @@ -172,20 +172,45 @@ func TestDistinguishUnhealthyClustersWithOthers(t *testing.T) {
},
},
},
expectedClusters: nil,
expectedOthers: []string{"member1"},
expectedClusters: nil,
expectedHealthyClusters: []string{"member1"},
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got, gotOthers := distinguishUnhealthyClustersWithOthers(tt.aggregatedStatusItems, tt.resourceBindingSpec); !reflect.DeepEqual(got, tt.expectedClusters) || !reflect.DeepEqual(gotOthers, tt.expectedOthers) {
t.Errorf("distinguishUnhealthyClustersWithOthers() = (%v, %v), want (%v, %v)", got, gotOthers, tt.expectedClusters, tt.expectedOthers)
if got, gotHealthyClusters := distinguishUnhealthyClustersWithOthers(tt.aggregatedStatusItems, tt.resourceBindingSpec); !reflect.DeepEqual(got, tt.expectedClusters) || !reflect.DeepEqual(gotHealthyClusters, tt.expectedHealthyClusters) {
t.Errorf("distinguishUnhealthyClustersWithOthers() = (%v, %v), want (%v, %v)", got, gotHealthyClusters, tt.expectedClusters, tt.expectedHealthyClusters)
}
})
}
}

func TestPreserveTimestampAcrossUnknownTransitions(t *testing.T) {
key := types.NamespacedName{Namespace: "default", Name: "test"}
cluster := "member1"
allClusters := sets.New[string](cluster)

m := newWorkloadUnhealthyMap()

// T=0: cluster first observed unhealthy; toleration window starts.
m.setTimeStamp(key, cluster)
originalTimestamp := m.getTimeStamp(key, cluster)
assert.True(t, m.hasWorkloadBeenUnhealthy(key, cluster))

// Health transitions to Unknown (e.g. karmada-agent restarts).
// With the fix, Unknown clusters must not appear in healthyClusters.
_, healthyClusters := distinguishUnhealthyClustersWithOthers(
[]workv1alpha2.AggregatedStatusItem{{ClusterName: cluster, Health: workv1alpha2.ResourceUnknown}},
workv1alpha2.ResourceBindingSpec{},
)
m.deleteIrrelevantClusters(key, allClusters, healthyClusters)

// Timestamp must be preserved so the toleration window continues from T=0.
assert.True(t, m.hasWorkloadBeenUnhealthy(key, cluster), "failover timer must survive Unknown health transitions")
assert.Equal(t, originalTimestamp, m.getTimeStamp(key, cluster), "failover timestamp must not be reset by Unknown health transitions")
}

func Test_getClusterNamesFromTargetClusters(t *testing.T) {
type args struct {
targetClusters []workv1alpha2.TargetCluster
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ func (c *CRBApplicationFailoverController) syncBinding(ctx context.Context, bind
allClusters.Insert(cluster.Name)
}

unhealthyClusters, others := distinguishUnhealthyClustersWithOthers(binding.Status.AggregatedStatus, binding.Spec)
unhealthyClusters, healthyClusters := distinguishUnhealthyClustersWithOthers(binding.Status.AggregatedStatus, binding.Spec)
duration, needEvictClusters := c.detectFailure(unhealthyClusters, tolerationSeconds, key)

err := c.evictBinding(binding, needEvictClusters)
Expand All @@ -143,7 +143,7 @@ func (c *CRBApplicationFailoverController) syncBinding(ctx context.Context, bind
}

// Cleanup clusters on which the application status is not unhealthy and clusters that have been evicted or removed in the workloadUnhealthyMap.
c.workloadUnhealthyMap.deleteIrrelevantClusters(key, allClusters, others)
c.workloadUnhealthyMap.deleteIrrelevantClusters(key, allClusters, healthyClusters)

return time.Duration(duration) * time.Second, nil
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ func (c *RBApplicationFailoverController) syncBinding(ctx context.Context, bindi
allClusters.Insert(cluster.Name)
}

unhealthyClusters, others := distinguishUnhealthyClustersWithOthers(binding.Status.AggregatedStatus, binding.Spec)
unhealthyClusters, healthyClusters := distinguishUnhealthyClustersWithOthers(binding.Status.AggregatedStatus, binding.Spec)
duration, needEvictClusters := c.detectFailure(unhealthyClusters, tolerationSeconds, key)

err := c.evictBinding(binding, needEvictClusters)
Expand All @@ -143,7 +143,7 @@ func (c *RBApplicationFailoverController) syncBinding(ctx context.Context, bindi
}

// Cleanup clusters on which the application status is not unhealthy and clusters that have been evicted or removed in the workloadUnhealthyMap.
c.workloadUnhealthyMap.deleteIrrelevantClusters(key, allClusters, others)
c.workloadUnhealthyMap.deleteIrrelevantClusters(key, allClusters, healthyClusters)

return time.Duration(duration) * time.Second, nil
}
Expand Down
Loading