
Commit 57436b2

armru, leonardoce, and mnencia authored
fix(cluster): strip stale primary label during failover (cloudnative-pg#10409)
When the operator initiates a failover, the old primary's pod keeps its `cnpg.io/instanceRole=primary` label until `ReconcileMetadata` runs. But `ReconcileMetadata` is skipped during the entire failover window (the `CurrentPrimary != TargetPrimary` guard returns early), so the `-rw` service keeps routing to the old primary. If the old primary comes back (e.g. after a temporary network partition), replicas reconnect through the `-rw` service, satisfy the sync quorum, and writes committed on the stale primary are lost to `pg_rewind`.

Introduce a third value for the instance role label, `unhealthy`, and apply it to the old primary as soon as failover starts. Since neither the `-rw` nor the `-ro` service selector matches `unhealthy`, the pod is immediately isolated from all service traffic for the duration of the failover window. `ReconcileMetadata` restores the `replica` label once `CurrentPrimary == TargetPrimary`.

The label is applied best-effort at the point where failover is initiated, and re-applied on every pass of the reconcile loop while the failover is in progress, so transient API errors are retried automatically.

Note: stripping the label removes the pod from the service Endpoints, but does not drop TCP connections already established by a replica's walreceiver. This fix closes the reconnection window; established connections must still be terminated by the Postgres-level promotion on the new primary.

Closes cloudnative-pg#10403

Signed-off-by: Armando Ruocco <armando.ruocco@enterprisedb.com>
Signed-off-by: Marco Nenciarini <marco.nenciarini@enterprisedb.com>
Signed-off-by: Leonardo Cecchi <leonardo.cecchi@enterprisedb.com>
Co-authored-by: Leonardo Cecchi <leonardo.cecchi@enterprisedb.com>
Co-authored-by: Marco Nenciarini <marco.nenciarini@enterprisedb.com>
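For readers outside the codebase, a minimal sketch of the routing mechanism this fix relies on. This is not the operator's actual service-building code: the helper name, port, and selector keys other than the role label are illustrative assumptions. The point it shows is that the `-rw` selector pins `primary` and the `-ro` selector pins `replica`, so a pod relabeled `unhealthy` matches neither and drops out of both Endpoints.

package main

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// clusterService is a hypothetical helper: it builds a Service whose selector
// pins a single value of the instance role label.
func clusterService(clusterName, suffix, roleValue string) *corev1.Service {
	return &corev1.Service{
		ObjectMeta: metav1.ObjectMeta{Name: clusterName + suffix},
		Spec: corev1.ServiceSpec{
			Selector: map[string]string{
				"cnpg.io/cluster":      clusterName,
				"cnpg.io/instanceRole": roleValue, // "primary" for -rw, "replica" for -ro
			},
			Ports: []corev1.ServicePort{{Name: "postgres", Port: 5432}},
		},
	}
}

func main() {
	_ = clusterService("cluster-example", "-rw", "primary")
	_ = clusterService("cluster-example", "-ro", "replica")
	// A pod carrying cnpg.io/instanceRole=unhealthy matches neither selector,
	// so it is removed from the Endpoints of both Services.
}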
1 parent f774cb9 commit 57436b2

10 files changed

Lines changed: 215 additions & 11 deletions


contribute/technical-architecture.md

Lines changed: 5 additions & 3 deletions
@@ -214,9 +214,11 @@ Networking is purely label-driven. The Operator manages the

 2. **Replica:** Labeled as `replica`. The `-ro` Service selects these Pods.

-3. **Failover:** When a new primary is promoted via `pg_ctl promote`, the
-   Operator updates the labels. The Kubernetes API Server then automatically
-   updates the **Endpoints** for the respective Services.
+3. **Unhealthy:** Transient value applied to the old primary during a
+   failover or switchover, so that neither the `-rw` nor the `-ro`
+   Service selects it. Once the transition completes, the Operator
+   relabels the demoted instance as `replica` and the Kubernetes API
+   Server updates the Service **Endpoints** accordingly.

 ## Source Code Reference

docs/src/labels_annotations.md

Lines changed: 9 additions & 3 deletions
@@ -102,15 +102,21 @@ This label is available only on `VolumeSnapshot` resources.
   default users created by CloudNativePG (typically `postgres` and `app`).

 `role` - **deprecated**
-: Whether the instance running in a pod is a `primary` or a `replica`.
-  This label is deprecated, you should use `cnpg.io/instanceRole` instead.
+: Role of the instance running in a pod: `primary`, `replica`, or
+  `unhealthy`. The `unhealthy` value is transient: the operator sets
+  it on the old primary during a failover or switchover and clears it
+  automatically once the transition completes. This label is deprecated,
+  you should use `cnpg.io/instanceRole` instead.

 `cnpg.io/scheduled-backup`
 : When available, name of the `ScheduledBackup` resource that created a given
   `Backup` object.

 `cnpg.io/instanceRole`
-: Whether the instance running in a pod is a `primary` or a `replica`.
+: Role of the instance running in a pod: `primary`, `replica`, or
+  `unhealthy`. The `unhealthy` value is transient: the operator sets
+  it on the old primary during a failover or switchover and clears it
+  automatically once the transition completes.

 `app.kubernetes.io/managed-by`
 : Name of the manager. It will always be `cloudnative-pg`.

internal/controller/cluster_controller.go

Lines changed: 12 additions & 0 deletions
@@ -367,6 +367,18 @@ func (r *ClusterReconciler) reconcile(ctx context.Context, cluster *apiv1.Cluster

 	if cluster.Status.CurrentPrimary != "" &&
 		cluster.Status.CurrentPrimary != cluster.Status.TargetPrimary {
+		// Mark the old primary as unhealthy on every pass while failover is
+		// in progress. This retries each second until it succeeds,
+		// complementing the immediate best-effort attempt in replicas.go.
+		if err := r.markOldPrimaryAsUnhealthy(
+			ctx, cluster.Status.CurrentPrimary, resources.instances.Items,
+		); err != nil {
+			contextLogger.Warning(
+				"Failed to strip primary label from old primary, will retry",
+				"oldPrimary", cluster.Status.CurrentPrimary,
+				"error", err)
+		}
+
 		contextLogger.Info("There is a switchover or a failover "+
 			"in progress, waiting for the operation to complete",
 			"currentPrimary", cluster.Status.CurrentPrimary,

internal/controller/replicas.go

Lines changed: 54 additions & 0 deletions
@@ -140,6 +140,22 @@ func (r *ClusterReconciler) reconcileTargetPrimaryForNonReplicaCluster(
 		if err != nil {
 			return "", err
 		}
+
+		// Mark the old primary as unhealthy immediately when failover starts,
+		// removing it from both the -rw and -ro services. This prevents replicas
+		// from reconnecting to it (primary_conninfo uses <cluster>-rw) and
+		// satisfying the synchronous replication quorum on a stale primary.
+		// Best-effort: the failover must proceed even if this fails. The
+		// retryable call in the reconcile loop's failover guard will correct
+		// the label on subsequent passes.
+		if err := r.markOldPrimaryAsUnhealthy(
+			ctx,
+			cluster.Status.CurrentPrimary,
+			resources.instances.Items,
+		); err != nil {
+			contextLogger.Error(err, "Failed to strip primary label from old primary, continuing with failover",
+				"oldPrimary", cluster.Status.CurrentPrimary)
+		}
 	}

 	// Wait until all the WAL receivers are down. This is needed to avoid losing the WAL
@@ -180,6 +196,39 @@ func (r *ClusterReconciler) reconcileTargetPrimaryForNonReplicaCluster(
 	return mostAdvancedInstance.Pod.Name, r.setPrimaryInstance(ctx, cluster, mostAdvancedInstance.Pod.Name)
 }

+// markOldPrimaryAsUnhealthy labels the old primary pod as unhealthy when failover
+// starts, removing it from both the -rw and -ro service selectors until
+// ReconcileMetadata restores the correct label after promotion completes.
+func (r *ClusterReconciler) markOldPrimaryAsUnhealthy(
+	ctx context.Context,
+	oldPrimaryName string,
+	instances []corev1.Pod,
+) error {
+	contextLogger := log.FromContext(ctx)
+
+	idx := slices.IndexFunc(instances, func(pod corev1.Pod) bool {
+		return pod.Name == oldPrimaryName
+	})
+	if idx == -1 {
+		contextLogger.Warning(
+			"Old primary pod not found in managed instances, skipping label demotion",
+			"oldPrimary", oldPrimaryName)
+		return nil
+	}
+
+	oldPrimary := &instances[idx]
+	if role, _ := utils.GetInstanceRole(oldPrimary.Labels); role == specs.ClusterRoleLabelUnhealthy {
+		return nil
+	}
+
+	contextLogger.Info(
+		"Setting primary label to unhealthy in the old primary during failover",
+		"pod", oldPrimary.Name)
+	origPod := oldPrimary.DeepCopy()
+	utils.SetInstanceRole(&oldPrimary.ObjectMeta, specs.ClusterRoleLabelUnhealthy)
+	return r.Patch(ctx, oldPrimary, client.MergeFrom(origPod))
+}
+
 // isNodeUnschedulableOrBeingDrained checks if a node is currently being drained.
 // Copied from https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/7bacf2d36f397bd098b3388403e8759c480be7e5/cmd/hooks/prestop.go#L91
 //
@@ -334,6 +383,11 @@ func (r *ClusterReconciler) reconcileTargetPrimaryForReplicaCluster(
 		return "", err
 	}

+	// Unlike the non-replica path, we do not strip the old primary label here:
+	// a designated primary does not accept application writes via the -rw
+	// service, so the split-brain window #10403 guards against does not
+	// apply. The retryable call in the reconcile loop's failover guard still
+	// relabels the pod on its next pass.
 	return status.Items[0].Pod.Name, r.setPrimaryInstance(ctx, cluster, status.Items[0].Pod.Name)
 }
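A side note on the helper above: `client.MergeFrom(origPod)` followed by `r.Patch` sends only the difference between the two pod snapshots, i.e. a JSON merge patch carrying just the relabeled keys, rather than rewriting the whole object. The standalone sketch below is illustrative and not part of this commit; it prints the patch body controller-runtime computes for such a label change.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

func main() {
	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:   "cluster-example-1",
			Labels: map[string]string{"cnpg.io/instanceRole": "primary"},
		},
	}

	// Same pattern as markOldPrimaryAsUnhealthy: snapshot, mutate, diff.
	orig := pod.DeepCopy()
	pod.Labels["cnpg.io/instanceRole"] = "unhealthy"

	patch := client.MergeFrom(orig)
	data, err := patch.Data(pod)
	if err != nil {
		panic(err)
	}
	// Prints something like:
	// {"metadata":{"labels":{"cnpg.io/instanceRole":"unhealthy"}}}
	fmt.Println(string(data))
}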

internal/controller/replicas_test.go

Lines changed: 117 additions & 0 deletions
@@ -20,11 +20,18 @@ SPDX-License-Identifier: Apache-2.0
 package controller

 import (
+	"context"
+	"fmt"
+
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+	"sigs.k8s.io/controller-runtime/pkg/client/interceptor"

 	apiv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1"
 	"github.com/cloudnative-pg/cloudnative-pg/pkg/postgres"
+	"github.com/cloudnative-pg/cloudnative-pg/pkg/specs"
 	"github.com/cloudnative-pg/cloudnative-pg/pkg/utils"

 	. "github.com/onsi/ginkgo/v2"
@@ -129,6 +136,116 @@ var _ = Describe("Sacrificial Pod detection", func() {
 	})
 })

+var _ = Describe("markOldPrimaryAsUnhealthy", func() {
+	var env *testingEnvironment
+
+	BeforeEach(func() {
+		env = buildTestEnvironment()
+	})
+
+	makePod := func(name, namespace, role string) corev1.Pod {
+		pod := corev1.Pod{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      name,
+				Namespace: namespace,
+				Labels:    map[string]string{},
+			},
+		}
+		if role != "" {
+			utils.SetInstanceRole(&pod.ObjectMeta, role)
+		}
+		return pod
+	}
+
+	It("changes the primary label from the old primary pod", func() {
+		ctx := context.Background()
+		namespace := newFakeNamespace(env.client)
+
+		primary := makePod("cluster-1", namespace, specs.ClusterRoleLabelPrimary)
+		replica1 := makePod("cluster-2", namespace, specs.ClusterRoleLabelReplica)
+		replica2 := makePod("cluster-3", namespace, specs.ClusterRoleLabelReplica)
+
+		for i, pod := range []corev1.Pod{primary, replica1, replica2} {
+			p := pod
+			Expect(env.client.Create(ctx, &p)).To(Succeed())
+			// refresh the local copy with server-assigned fields
+			if i == 0 {
+				primary = p
+			}
+		}
+
+		pods := []corev1.Pod{primary, replica1, replica2}
+
+		err := env.clusterReconciler.markOldPrimaryAsUnhealthy(ctx, "cluster-1", pods)
+		Expect(err).ToNot(HaveOccurred())
+
+		// Verify the old primary's label was changed to unhealthy on the API server
+		var updated corev1.Pod
+		Expect(env.client.Get(ctx, client.ObjectKeyFromObject(&primary), &updated)).To(Succeed())
+		Expect(updated.Labels[utils.ClusterInstanceRoleLabelName]).To(Equal(specs.ClusterRoleLabelUnhealthy))
+		//nolint:staticcheck
+		Expect(updated.Labels[utils.ClusterRoleLabelName]).To(Equal(specs.ClusterRoleLabelUnhealthy))

+		// Verify replica pods are unchanged
+		var replica1Updated corev1.Pod
+		Expect(env.client.Get(ctx, client.ObjectKeyFromObject(&replica1), &replica1Updated)).To(Succeed())
+		Expect(replica1Updated.Labels[utils.ClusterInstanceRoleLabelName]).To(Equal(specs.ClusterRoleLabelReplica))
+	})
+
+	It("does not error when the old primary is not in the pod list", func() {
+		ctx := context.Background()
+		namespace := newFakeNamespace(env.client)
+
+		replica := makePod("cluster-2", namespace, specs.ClusterRoleLabelReplica)
+		Expect(env.client.Create(ctx, &replica)).To(Succeed())
+
+		err := env.clusterReconciler.markOldPrimaryAsUnhealthy(ctx, "cluster-1", []corev1.Pod{replica})
+		Expect(err).ToNot(HaveOccurred())
+	})
+
+	It("is a no-op when the old primary already has the unhealthy label", func() {
+		ctx := context.Background()
+		namespace := newFakeNamespace(env.client)
+
+		pod := makePod("cluster-1", namespace, specs.ClusterRoleLabelUnhealthy)
+		Expect(env.client.Create(ctx, &pod)).To(Succeed())
+
+		err := env.clusterReconciler.markOldPrimaryAsUnhealthy(ctx, "cluster-1", []corev1.Pod{pod})
+		Expect(err).ToNot(HaveOccurred())
+
+		var updated corev1.Pod
+		Expect(env.client.Get(ctx, client.ObjectKeyFromObject(&pod), &updated)).To(Succeed())
+		Expect(updated.Labels[utils.ClusterInstanceRoleLabelName]).To(Equal(specs.ClusterRoleLabelUnhealthy))
+	})
+
+	It("surfaces the Patch error so callers can apply their best-effort or retry strategy", func() {
+		ctx := context.Background()
+		namespace := newFakeNamespace(env.client)
+
+		primary := makePod("cluster-1", namespace, specs.ClusterRoleLabelPrimary)
+
+		failingClient := fake.NewClientBuilder().
+			WithScheme(env.scheme).
+			WithObjects(&primary).
+			WithInterceptorFuncs(interceptor.Funcs{
+				Patch: func(_ context.Context, _ client.WithWatch, obj client.Object,
+					_ client.Patch, _ ...client.PatchOption,
+				) error {
+					Expect(obj).To(BeAssignableToTypeOf(&corev1.Pod{}))
+					Expect(obj.GetName()).To(Equal("cluster-1"))
+					Expect(obj.GetNamespace()).To(Equal(namespace))
+					return fmt.Errorf("simulated API server error")
+				},
+			}).
+			Build()
+
+		r := &ClusterReconciler{Client: failingClient, Scheme: env.scheme}
+
+		err := r.markOldPrimaryAsUnhealthy(ctx, "cluster-1", []corev1.Pod{primary})
+		Expect(err).To(MatchError(ContainSubstring("simulated API server error")))
+	})
+})
+
 var _ = Describe("Check pods not on primary node", func() {
 	item1 := postgres.PostgresqlStatus{
 		IsPrimary: false,

pkg/reconciler/instance/metadata.go

Lines changed: 7 additions & 2 deletions
@@ -196,15 +196,20 @@ func updateRoleLabels(
 		if !hasRole || podRole != specs.ClusterRoleLabelPrimary || !newHasRole ||
 			newPodRole != specs.ClusterRoleLabelPrimary {
 			contextLogger.Info("Setting primary label", "pod", instance.Name)
-			utils.SetInstanceRole(instance.ObjectMeta, specs.ClusterRoleLabelPrimary)
+			utils.SetInstanceRole(&instance.ObjectMeta, specs.ClusterRoleLabelPrimary)
 			return true
 		}

 	default:
+		// This intentionally overwrites the transient ClusterRoleLabelUnhealthy value
+		// that the failover path sets on the old primary. This function is only reached
+		// once CurrentPrimary == TargetPrimary (the failover guard in the reconcile loop
+		// returns early otherwise), so by the time we get here the old primary has been
+		// demoted and "replica" is the correct label.
 		if !hasRole || podRole != specs.ClusterRoleLabelReplica || !newHasRole ||
 			newPodRole != specs.ClusterRoleLabelReplica {
 			contextLogger.Info("Setting replica label", "pod", instance.Name)
-			utils.SetInstanceRole(instance.ObjectMeta, specs.ClusterRoleLabelReplica)
+			utils.SetInstanceRole(&instance.ObjectMeta, specs.ClusterRoleLabelReplica)
 			return true
 		}
 	}

pkg/reconciler/majorupgrade/reconciler.go

Lines changed: 1 addition & 1 deletion
@@ -223,7 +223,7 @@ func createMajorUpgradeJob(
 		cluster.GetFixedInheritedLabels(), configuration.Current)
 	utils.InheritLabels(&job.Spec.Template.ObjectMeta, cluster.Labels,
 		cluster.GetFixedInheritedLabels(), configuration.Current)
-	utils.SetInstanceRole(job.Spec.Template.ObjectMeta, specs.ClusterRoleLabelPrimary)
+	utils.SetInstanceRole(&job.Spec.Template.ObjectMeta, specs.ClusterRoleLabelPrimary)

 	contextLogger.Info("Creating new major upgrade Job",
 		"jobName", job.Name,

pkg/reconciler/persistentvolumeclaim/metadata.go

Lines changed: 6 additions & 1 deletion
@@ -82,6 +82,11 @@ func reconcileInstanceRoleLabel(
 		return nil
 	}
 	for _, instanceName := range cluster.Status.InstanceNames {
+		// PVCs inherit the role label from the instance name, independently of
+		// the pod's current label. The failover guard in the reconcile loop
+		// prevents this code from running while CurrentPrimary != TargetPrimary,
+		// so by the time we get here the old primary has already been demoted
+		// and "replica" is correct.
 		instanceRole := specs.ClusterRoleLabelReplica
 		if instanceName == cluster.Status.CurrentPrimary {
 			instanceRole = specs.ClusterRoleLabelPrimary
@@ -101,7 +106,7 @@
 				return true
 			},
 			update: func(pvc *corev1.PersistentVolumeClaim) {
-				utils.SetInstanceRole(pvc.ObjectMeta, instanceRole)
+				utils.SetInstanceRole(&pvc.ObjectMeta, instanceRole)
 			},
 		}

pkg/specs/pods.go

Lines changed: 3 additions & 0 deletions
@@ -70,6 +70,9 @@ const (
 	// ClusterRoleLabelReplica is written in labels to represent replica servers
 	ClusterRoleLabelReplica = "replica"

+	// ClusterRoleLabelUnhealthy is applied to the old primary when a failover starts.
+	ClusterRoleLabelUnhealthy = "unhealthy"
+
 	// PostgresContainerName is the name of the container executing PostgreSQL
 	// inside one Pod
 	PostgresContainerName = "postgres"

pkg/utils/labels_annotations.go

Lines changed: 1 addition & 1 deletion
@@ -553,7 +553,7 @@ func GetInstanceRole(labels map[string]string) (string, bool) {
 }

 // SetInstanceRole sets both ClusterRoleLabelName and ClusterInstanceRoleLabelName on the given ObjectMeta
-func SetInstanceRole(meta metav1.ObjectMeta, role string) {
+func SetInstanceRole(meta *metav1.ObjectMeta, role string) {
 	if meta.Labels == nil {
 		meta.Labels = map[string]string{}
 	}
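The signature change in this last file is what forces the `&` at every call site above: with a value parameter, `SetInstanceRole` can still write into an existing `Labels` map (maps are references), but the map it allocates when `Labels` is nil is assigned to a copy and silently dropped. A small standalone illustration of the difference, using hypothetical helper names rather than the repository's code:

package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// setRoleByValue mirrors the old signature: writes to an existing map survive,
// but the map allocated for a nil Labels field never reaches the caller.
func setRoleByValue(meta metav1.ObjectMeta, role string) {
	if meta.Labels == nil {
		meta.Labels = map[string]string{} // assigned to the copy only
	}
	meta.Labels["cnpg.io/instanceRole"] = role
}

// setRoleByPointer mirrors the new signature: the allocation is visible to the caller.
func setRoleByPointer(meta *metav1.ObjectMeta, role string) {
	if meta.Labels == nil {
		meta.Labels = map[string]string{}
	}
	meta.Labels["cnpg.io/instanceRole"] = role
}

func main() {
	a := metav1.ObjectMeta{} // Labels is nil, as on a freshly built ObjectMeta
	setRoleByValue(a, "unhealthy")
	fmt.Println(a.Labels) // map[] — the label is lost

	b := metav1.ObjectMeta{}
	setRoleByPointer(&b, "unhealthy")
	fmt.Println(b.Labels) // map[cnpg.io/instanceRole:unhealthy]
}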
