diff --git a/pkg/controllers/applicationfailover/common.go b/pkg/controllers/applicationfailover/common.go index 11178a83ec26..1737c9718f41 100644 --- a/pkg/controllers/applicationfailover/common.go +++ b/pkg/controllers/applicationfailover/common.go @@ -106,9 +106,11 @@ func (m *workloadUnhealthyMap) deleteIrrelevantClusters(key types.NamespacedName m.workloadUnhealthy[key] = unhealthyClusters } -// distinguishUnhealthyClustersWithOthers distinguishes clusters which is in the unHealthy state(not in the process of eviction) with others. +// distinguishUnhealthyClustersWithOthers distinguishes unhealthy clusters (not in the process of eviction) from healthy ones. +// ResourceUnknown clusters are intentionally excluded from both lists to preserve their failover timers across transient +// status collection gaps. func distinguishUnhealthyClustersWithOthers(aggregatedStatusItems []workv1alpha2.AggregatedStatusItem, resourceBindingSpec workv1alpha2.ResourceBindingSpec) ([]string, []string) { - var unhealthyClusters, others []string + var unhealthyClusters, healthyClusters []string for _, aggregatedStatusItem := range aggregatedStatusItems { cluster := aggregatedStatusItem.ClusterName @@ -116,12 +118,12 @@ func distinguishUnhealthyClustersWithOthers(aggregatedStatusItems []workv1alpha2 unhealthyClusters = append(unhealthyClusters, cluster) } - if aggregatedStatusItem.Health == workv1alpha2.ResourceHealthy || aggregatedStatusItem.Health == workv1alpha2.ResourceUnknown { - others = append(others, cluster) + if aggregatedStatusItem.Health == workv1alpha2.ResourceHealthy { + healthyClusters = append(healthyClusters, cluster) } } - return unhealthyClusters, others + return unhealthyClusters, healthyClusters } func getClusterNamesFromTargetClusters(targetClusters []workv1alpha2.TargetCluster) []string { diff --git a/pkg/controllers/applicationfailover/common_test.go b/pkg/controllers/applicationfailover/common_test.go index ee66641a0c7b..f5c4aab15a45 100644 --- a/pkg/controllers/applicationfailover/common_test.go +++ b/pkg/controllers/applicationfailover/common_test.go @@ -99,11 +99,11 @@ func TestWorkloadUnhealthyMap_deleteIrrelevantClusters(t *testing.T) { func TestDistinguishUnhealthyClustersWithOthers(t *testing.T) { tests := []struct { - name string - aggregatedStatusItems []workv1alpha2.AggregatedStatusItem - resourceBindingSpec workv1alpha2.ResourceBindingSpec - expectedClusters []string - expectedOthers []string + name string + aggregatedStatusItems []workv1alpha2.AggregatedStatusItem + resourceBindingSpec workv1alpha2.ResourceBindingSpec + expectedClusters []string + expectedHealthyClusters []string }{ { name: "all applications are healthy", @@ -117,9 +117,9 @@ func TestDistinguishUnhealthyClustersWithOthers(t *testing.T) { Health: workv1alpha2.ResourceHealthy, }, }, - resourceBindingSpec: workv1alpha2.ResourceBindingSpec{}, - expectedClusters: nil, - expectedOthers: []string{"member1", "member2"}, + resourceBindingSpec: workv1alpha2.ResourceBindingSpec{}, + expectedClusters: nil, + expectedHealthyClusters: []string{"member1", "member2"}, }, { name: "all applications are unknown", @@ -133,9 +133,9 @@ func TestDistinguishUnhealthyClustersWithOthers(t *testing.T) { Health: workv1alpha2.ResourceUnknown, }, }, - resourceBindingSpec: workv1alpha2.ResourceBindingSpec{}, - expectedClusters: nil, - expectedOthers: []string{"member1", "member2"}, + resourceBindingSpec: workv1alpha2.ResourceBindingSpec{}, + expectedClusters: nil, + expectedHealthyClusters: nil, }, { name: "one application is unhealthy and not in gracefulEvictionTasks", @@ -149,9 +149,9 @@ func TestDistinguishUnhealthyClustersWithOthers(t *testing.T) { Health: workv1alpha2.ResourceUnhealthy, }, }, - resourceBindingSpec: workv1alpha2.ResourceBindingSpec{}, - expectedClusters: []string{"member2"}, - expectedOthers: []string{"member1"}, + resourceBindingSpec: workv1alpha2.ResourceBindingSpec{}, + expectedClusters: []string{"member2"}, + expectedHealthyClusters: []string{"member1"}, }, { name: "one application is unhealthy and in gracefulEvictionTasks", @@ -172,20 +172,45 @@ func TestDistinguishUnhealthyClustersWithOthers(t *testing.T) { }, }, }, - expectedClusters: nil, - expectedOthers: []string{"member1"}, + expectedClusters: nil, + expectedHealthyClusters: []string{"member1"}, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got, gotOthers := distinguishUnhealthyClustersWithOthers(tt.aggregatedStatusItems, tt.resourceBindingSpec); !reflect.DeepEqual(got, tt.expectedClusters) || !reflect.DeepEqual(gotOthers, tt.expectedOthers) { - t.Errorf("distinguishUnhealthyClustersWithOthers() = (%v, %v), want (%v, %v)", got, gotOthers, tt.expectedClusters, tt.expectedOthers) + if got, gotHealthyClusters := distinguishUnhealthyClustersWithOthers(tt.aggregatedStatusItems, tt.resourceBindingSpec); !reflect.DeepEqual(got, tt.expectedClusters) || !reflect.DeepEqual(gotHealthyClusters, tt.expectedHealthyClusters) { + t.Errorf("distinguishUnhealthyClustersWithOthers() = (%v, %v), want (%v, %v)", got, gotHealthyClusters, tt.expectedClusters, tt.expectedHealthyClusters) } }) } } +func TestPreserveTimestampAcrossUnknownTransitions(t *testing.T) { + key := types.NamespacedName{Namespace: "default", Name: "test"} + cluster := "member1" + allClusters := sets.New[string](cluster) + + m := newWorkloadUnhealthyMap() + + // T=0: cluster first observed unhealthy; toleration window starts. + m.setTimeStamp(key, cluster) + originalTimestamp := m.getTimeStamp(key, cluster) + assert.True(t, m.hasWorkloadBeenUnhealthy(key, cluster)) + + // Health transitions to Unknown (e.g. karmada-agent restarts). + // With the fix, Unknown clusters must not appear in healthyClusters. + _, healthyClusters := distinguishUnhealthyClustersWithOthers( + []workv1alpha2.AggregatedStatusItem{{ClusterName: cluster, Health: workv1alpha2.ResourceUnknown}}, + workv1alpha2.ResourceBindingSpec{}, + ) + m.deleteIrrelevantClusters(key, allClusters, healthyClusters) + + // Timestamp must be preserved so the toleration window continues from T=0. + assert.True(t, m.hasWorkloadBeenUnhealthy(key, cluster), "failover timer must survive Unknown health transitions") + assert.Equal(t, originalTimestamp, m.getTimeStamp(key, cluster), "failover timestamp must not be reset by Unknown health transitions") +} + func Test_getClusterNamesFromTargetClusters(t *testing.T) { type args struct { targetClusters []workv1alpha2.TargetCluster diff --git a/pkg/controllers/applicationfailover/crb_application_failover_controller.go b/pkg/controllers/applicationfailover/crb_application_failover_controller.go index 839bdcb43416..edffc1ba92fe 100644 --- a/pkg/controllers/applicationfailover/crb_application_failover_controller.go +++ b/pkg/controllers/applicationfailover/crb_application_failover_controller.go @@ -127,7 +127,7 @@ func (c *CRBApplicationFailoverController) syncBinding(ctx context.Context, bind allClusters.Insert(cluster.Name) } - unhealthyClusters, others := distinguishUnhealthyClustersWithOthers(binding.Status.AggregatedStatus, binding.Spec) + unhealthyClusters, healthyClusters := distinguishUnhealthyClustersWithOthers(binding.Status.AggregatedStatus, binding.Spec) duration, needEvictClusters := c.detectFailure(unhealthyClusters, tolerationSeconds, key) err := c.evictBinding(binding, needEvictClusters) @@ -143,7 +143,7 @@ func (c *CRBApplicationFailoverController) syncBinding(ctx context.Context, bind } // Cleanup clusters on which the application status is not unhealthy and clusters that have been evicted or removed in the workloadUnhealthyMap. - c.workloadUnhealthyMap.deleteIrrelevantClusters(key, allClusters, others) + c.workloadUnhealthyMap.deleteIrrelevantClusters(key, allClusters, healthyClusters) return time.Duration(duration) * time.Second, nil } diff --git a/pkg/controllers/applicationfailover/rb_application_failover_controller.go b/pkg/controllers/applicationfailover/rb_application_failover_controller.go index 463f4558314d..2cc3677a06ee 100644 --- a/pkg/controllers/applicationfailover/rb_application_failover_controller.go +++ b/pkg/controllers/applicationfailover/rb_application_failover_controller.go @@ -127,7 +127,7 @@ func (c *RBApplicationFailoverController) syncBinding(ctx context.Context, bindi allClusters.Insert(cluster.Name) } - unhealthyClusters, others := distinguishUnhealthyClustersWithOthers(binding.Status.AggregatedStatus, binding.Spec) + unhealthyClusters, healthyClusters := distinguishUnhealthyClustersWithOthers(binding.Status.AggregatedStatus, binding.Spec) duration, needEvictClusters := c.detectFailure(unhealthyClusters, tolerationSeconds, key) err := c.evictBinding(binding, needEvictClusters) @@ -143,7 +143,7 @@ func (c *RBApplicationFailoverController) syncBinding(ctx context.Context, bindi } // Cleanup clusters on which the application status is not unhealthy and clusters that have been evicted or removed in the workloadUnhealthyMap. - c.workloadUnhealthyMap.deleteIrrelevantClusters(key, allClusters, others) + c.workloadUnhealthyMap.deleteIrrelevantClusters(key, allClusters, healthyClusters) return time.Duration(duration) * time.Second, nil }