Skip to content

Commit f252432

Browse files
committed
wip
1 parent 4cea22d commit f252432

File tree

9 files changed

+102
-135
lines changed

9 files changed

+102
-135
lines changed

internal/controller/postgrescluster/instance.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -262,11 +262,11 @@ func newObservedInstances(
262262
return &observed
263263
}
264264

265-
// writablePod looks at observedInstances and finds an instance that matches
265+
// WritablePod looks at observedInstances and finds an instance that matches
266266
// a few conditions. The instance should be non-terminating, running, and
267267
// writable i.e. the instance with the primary. If such an instance exists, it
268268
// is returned along with the instance pod.
269-
func (observed *observedInstances) writablePod(container string) (*corev1.Pod, *Instance) {
269+
func (observed *observedInstances) WritablePod(container string) (*corev1.Pod, *Instance) {
270270
if observed == nil {
271271
return nil, nil
272272
}

internal/controller/postgrescluster/instance_test.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -381,7 +381,7 @@ func TestWritablePod(t *testing.T) {
381381
t.Run("empty observed", func(t *testing.T) {
382382
observed := &observedInstances{}
383383

384-
pod, instance := observed.writablePod("container")
384+
pod, instance := observed.WritablePod("container")
385385
assert.Assert(t, pod == nil)
386386
assert.Assert(t, instance == nil)
387387
})
@@ -415,7 +415,7 @@ func TestWritablePod(t *testing.T) {
415415
terminating, known := observed.forCluster[0].IsTerminating()
416416
assert.Assert(t, terminating && known)
417417

418-
pod, instance := observed.writablePod("container")
418+
pod, instance := observed.WritablePod("container")
419419
assert.Assert(t, pod == nil)
420420
assert.Assert(t, instance == nil)
421421
})
@@ -447,7 +447,7 @@ func TestWritablePod(t *testing.T) {
447447
running, known := observed.forCluster[0].IsRunning(container)
448448
assert.Check(t, !running && known)
449449

450-
pod, instance := observed.writablePod("container")
450+
pod, instance := observed.WritablePod("container")
451451
assert.Assert(t, pod == nil)
452452
assert.Assert(t, instance == nil)
453453
})
@@ -480,7 +480,7 @@ func TestWritablePod(t *testing.T) {
480480
writable, known := observed.forCluster[0].IsWritable()
481481
assert.Check(t, !writable && known)
482482

483-
pod, instance := observed.writablePod("container")
483+
pod, instance := observed.WritablePod("container")
484484
assert.Assert(t, pod == nil)
485485
assert.Assert(t, instance == nil)
486486
})
@@ -517,7 +517,7 @@ func TestWritablePod(t *testing.T) {
517517
running, known := observed.forCluster[0].IsRunning(container)
518518
assert.Check(t, running && known)
519519

520-
pod, instance := observed.writablePod("container")
520+
pod, instance := observed.WritablePod("container")
521521
assert.Assert(t, pod != nil)
522522
assert.Assert(t, instance != nil)
523523
})

internal/controller/postgrescluster/pgbouncer.go

Lines changed: 52 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,8 @@ func (r *Reconciler) reconcilePGBouncer(
5555
err = r.reconcilePGBouncerInPostgreSQL(ctx, cluster, instances, secret)
5656
}
5757
if err == nil {
58-
// Trigger RECONNECT if primary has changed to force new server connections.
58+
// Send SIGTERM to PgBouncer if primary has changed, triggering graceful
59+
// shutdown and container restart. The new process will do a fresh DNS lookup.
5960
// This prevents stale connections from routing traffic to a demoted replica.
6061
err = r.reconcilePGBouncerReconnect(ctx, cluster, instances)
6162
}
@@ -116,18 +117,9 @@ func (r *Reconciler) reconcilePGBouncerInPostgreSQL(
116117
) error {
117118
log := logging.FromContext(ctx)
118119

119-
var pod *corev1.Pod
120-
121120
// Find the PostgreSQL instance that can execute SQL that writes to every
122121
// database. When there is none, return early.
123-
124-
for _, instance := range instances.forCluster {
125-
writable, known := instance.IsWritable()
126-
if writable && known && len(instance.Pods) > 0 {
127-
pod = instance.Pods[0]
128-
break
129-
}
130-
}
122+
pod, _ := instances.WritablePod(naming.ContainerDatabase)
131123
if pod == nil {
132124
return nil
133125
}
@@ -590,15 +582,32 @@ func (r *Reconciler) reconcilePGBouncerPodDisruptionBudget(
590582
return err
591583
}
592584

593-
// reconcilePGBouncerReconnect triggers a RECONNECT command on all PgBouncer
594-
// pods when the primary has changed. This forces PgBouncer to establish new
585+
// pgbouncerPods returns a list of PgBouncer pods for the given cluster.
586+
func (r *Reconciler) pgbouncerPods(ctx context.Context, cluster *v1beta1.PostgresCluster) (*corev1.PodList, error) {
587+
pgbouncerPods := &corev1.PodList{}
588+
selector, err := naming.AsSelector(naming.ClusterPGBouncerSelector(cluster))
589+
if err != nil {
590+
return nil, errors.WithStack(err)
591+
}
592+
593+
if err := r.Client.List(ctx, pgbouncerPods,
594+
client.InNamespace(cluster.Namespace),
595+
client.MatchingLabelsSelector{Selector: selector}); err != nil {
596+
return nil, errors.WithStack(err)
597+
}
598+
return pgbouncerPods, nil
599+
}
600+
601+
// reconcilePGBouncerReconnect is a sub-reconciler that signals PgBouncer pods
602+
// when the primary has changed. This forces PgBouncer to establish new
595603
// server connections to the correct primary, preventing stale connections
596604
// from routing traffic to a demoted replica after failover.
597605
//
598606
// Note: the RECONNECT command was replaced because it only closes server
599607
// connections when they are "released" according to the pool mode. In
600608
// session mode that happens only when the client disconnects, so persistent
601609
// clients could keep hitting the old primary until they reconnected.
610+
// It returns an error for integration with the parent reconciler's error-handling chain.
602611
func (r *Reconciler) reconcilePGBouncerReconnect(
603612
ctx context.Context, cluster *v1beta1.PostgresCluster,
604613
instances *observedInstances,
@@ -610,79 +619,69 @@ func (r *Reconciler) reconcilePGBouncerReconnect(
610619
return nil
611620
}
612621

613-
var primaryPod *corev1.Pod
614-
for _, instance := range instances.forCluster {
615-
// Same condition as writablePod fn
616-
if writable, known := instance.IsWritable(); writable && known && len(instance.Pods) > 0 {
617-
primaryPod = instance.Pods[0]
618-
break
619-
}
620-
}
621-
622+
primaryPod, _ := instances.WritablePod(naming.ContainerDatabase)
622623
if primaryPod == nil {
623624
// We will retry later.
624-
log.V(1).Info("No writable instance found, skipping PgBouncer RECONNECT")
625+
log.V(1).Info("No writable instance found, skipping PgBouncer failover signal")
625626
return nil
626627
}
627628

628629
currentPrimaryUID := string(primaryPod.UID)
629-
lastReconnectUID := cluster.Status.Proxy.PGBouncer.LastReconnectPrimaryUID
630+
lastFailoverUID := cluster.Status.Proxy.PGBouncer.LastFailoverPrimaryUID
630631

631-
if currentPrimaryUID == lastReconnectUID {
632-
// Primary hasn't changed, no need to Reconnect.
632+
if currentPrimaryUID == lastFailoverUID {
633+
// Primary hasn't changed, no need to trigger failover.
633634
return nil
634635
}
635636

636-
log.Info("Primary changed, triggering PgBouncer RECONNECT",
637-
"previousPrimaryUID", lastReconnectUID,
637+
log.Info("Primary changed, triggering PgBouncer failover signal (SIGTERM)",
638+
"previousPrimaryUID", lastFailoverUID,
638639
"currentPrimaryUID", currentPrimaryUID,
639640
"currentPrimaryName", primaryPod.Name)
640641

641-
pgbouncerPods := &corev1.PodList{}
642-
selector, err := naming.AsSelector(naming.ClusterPGBouncerSelector(cluster))
642+
pgbouncerPods, err := r.pgbouncerPods(ctx, cluster)
643643
if err != nil {
644-
return errors.WithStack(err)
645-
}
646-
647-
if err := r.Client.List(ctx, pgbouncerPods,
648-
client.InNamespace(cluster.Namespace),
649-
client.MatchingLabelsSelector{Selector: selector}); err != nil {
650-
return errors.WithStack(err)
644+
return err
651645
}
652646

653-
// Send RECONNECT to each running PgBouncer pod
654-
var reconnectErr error
647+
// Send SIGTERM to each running PgBouncer pod to trigger graceful shutdown
648+
// and container restart. New PgBouncer process will do fresh DNS lookup.
649+
var failoverErrs []error
655650
successCount := 0
656651

657652
for i := range pgbouncerPods.Items {
658-
pod := &pgbouncerPods.Items[i]
653+
pod := pgbouncerPods.Items[i] // Copy value to avoid closure reference issues
659654
if pod.Status.Phase != corev1.PodRunning {
660655
continue
661656
}
662657

663-
exec := func(ctx context.Context, stdin io.Reader, stdout, stderr io.Writer, command ...string) error {
658+
if err := pgbouncer.SignalFailover(ctx, func(ctx context.Context, stdin io.Reader, stdout, stderr io.Writer, command ...string) error {
664659
return r.PodExec(ctx, pod.Namespace, pod.Name, naming.ContainerPGBouncer, stdin, stdout, stderr, command...)
665-
}
666-
667-
if err := pgbouncer.Reconnect(ctx, exec); err != nil {
668-
log.Error(err, "PgBouncer RECONNECT: failed to issue command to pod.", "pod", pod.Name)
669-
reconnectErr = err
660+
}); err != nil {
661+
log.Error(err, "PgBouncer failover signal: failed to send SIGTERM to pod", "pod", pod.Name)
662+
failoverErrs = append(failoverErrs, fmt.Errorf("pod %s: %w", pod.Name, err))
670663
} else {
671664
successCount++
672665
}
673666
}
674667

675-
// If we can't send a RECONNECT command to one of the pods, we won't update the LastReconnectPrimaryUID.
676-
// This means this will run again in the next reconciliation loop.
677-
if reconnectErr == nil {
678-
cluster.Status.Proxy.PGBouncer.LastReconnectPrimaryUID = currentPrimaryUID
668+
// Update status only if all pods were successfully signaled.
669+
// Partial failures will be retried in the next reconciliation loop.
670+
if len(failoverErrs) == 0 {
671+
cluster.Status.Proxy.PGBouncer.LastFailoverPrimaryUID = currentPrimaryUID
679672
}
680673

681-
log.Info("PgBouncer RECONNECT: done",
682-
"failed", reconnectErr != nil,
674+
log.Info("PgBouncer failover signal: done",
675+
"failed", len(failoverErrs) > 0,
683676
"successCount", successCount,
677+
"failureCount", len(failoverErrs),
684678
"totalPods", len(pgbouncerPods.Items),
685679
)
686680

687-
return reconnectErr
681+
// Return aggregated errors if any pods failed
682+
if len(failoverErrs) > 0 {
683+
return fmt.Errorf("failed to signal %d of %d pgbouncer pods: %w",
684+
len(failoverErrs), len(pgbouncerPods.Items), failoverErrs[0])
685+
}
686+
return nil
688687
}

internal/controller/postgrescluster/pgmonitor.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ func (r *Reconciler) reconcilePGMonitorExporter(ctx context.Context,
5858

5959
// Find the PostgreSQL instance that can execute SQL that writes to every
6060
// database. When there is none, return early.
61-
writablePod, writableInstance = instances.writablePod(naming.ContainerDatabase)
61+
writablePod, writableInstance = instances.WritablePod(naming.ContainerDatabase)
6262
if writableInstance == nil || writablePod == nil {
6363
return nil
6464
}

internal/controller/postgrescluster/postgres.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ func (r *Reconciler) reconcilePostgresDatabases(
204204

205205
// Find the PostgreSQL instance that can execute SQL that writes system
206206
// catalogs. When there is none, return early.
207-
pod, _ := instances.writablePod(container)
207+
pod, _ := instances.WritablePod(container)
208208
if pod == nil {
209209
return nil
210210
}
@@ -1047,7 +1047,7 @@ func (r *Reconciler) reconcileDatabaseInitSQL(ctx context.Context,
10471047
// Now that we have the data provided by the user. We can check for a
10481048
// writable pod and get the podExecutor for the pod's database container
10491049
var podExecutor postgres.Executor
1050-
pod, _ := instances.writablePod(naming.ContainerDatabase)
1050+
pod, _ := instances.WritablePod(naming.ContainerDatabase)
10511051
if pod == nil {
10521052
log.V(1).Info("Could not find a pod with a writable database container.")
10531053
return nil

internal/pgbouncer/config.go

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -120,17 +120,6 @@ func clusterINI(cluster *v1beta1.PostgresCluster) string {
120120
"server_tls_sslmode": "verify-full",
121121
"server_tls_ca_file": certBackendAuthorityAbsolutePath,
122122

123-
// Enable Unix socket for admin console access. The special user
124-
// "pgbouncer" can connect without a password when using a Unix socket
125-
// from the same UID as the running process. This allows the operator
126-
// to send admin commands like RECONNECT after failover.
127-
// Ref.: https://www.pgbouncer.org/usage.html#admin-console
128-
"unix_socket_dir": "/tmp/pgbouncer",
129-
130-
// Allow the "pgbouncer" user to run admin commands (PAUSE, RESUME,
131-
// RECONNECT, etc.) on the admin console. Combined with unix_socket_dir,
132-
// this enables password-free admin access from within the container.
133-
"admin_users": "pgbouncer",
134123
}
135124

136125
// Override the above with any specified settings.

internal/pgbouncer/reconnect.go

Lines changed: 0 additions & 57 deletions
This file was deleted.
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
package pgbouncer
2+
3+
import (
4+
"context"
5+
6+
"github.com/percona/percona-postgresql-operator/internal/logging"
7+
"github.com/percona/percona-postgresql-operator/internal/postgres"
8+
)
9+
10+
// SignalFailover sends a SIGTERM to PgBouncer to trigger SHUTDOWN WAIT_FOR_CLIENTS [1]
11+
// mode, waiting for clients to gracefully disconnect [2]. This approach was
12+
// suggested by a PgBouncer maintainer [3] to deal with failovers in Kubernetes.
13+
//
14+
// What happens:
15+
// 1. Operator sends SIGTERM [2] to PgBouncer process (PID 1 in container)
16+
// 2. PgBouncer enters SHUTDOWN WAIT_FOR_CLIENTS mode [1].
17+
// 3. After Kubernetes grace period (default 30s), SIGKILL is sent if process still hasn't exited
18+
// 4. Container is terminated and restarted by the kubelet per the pod's restart policy.
19+
// 5. New PgBouncer process does fresh DNS lookup → connects to current primary.
20+
//
21+
// This approach is more effective than the RECONNECT command for session mode with persistent
22+
// clients (MPG clusters) because RECONNECT waits for clients to disconnect, which never happens
23+
// for persistent clients. SIGTERM guarantees termination and restart after the grace period.
24+
//
25+
// [1] https://www.pgbouncer.org/usage.html#shutdown
26+
// [2] https://www.pgbouncer.org/usage.html#signals
27+
// [3] https://github.com/pgbouncer/pgbouncer/issues/1361
28+
func SignalFailover(ctx context.Context, exec postgres.Executor) error {
29+
log := logging.FromContext(ctx)
30+
log.Info("SignalFailover: sending SIGTERM to force container restart")
31+
32+
err := exec(ctx, nil, nil, nil, "kill", "-TERM", "1")
33+
34+
log.Info("SignalFailover: SIGTERM sent.", "failed", err != nil)
35+
return err
36+
}

pkg/apis/postgres-operator.crunchydata.com/v1beta1/pgbouncer_types.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -174,9 +174,9 @@ type PGBouncerPodStatus struct {
174174
// Total number of non-terminated pods.
175175
Replicas int32 `json:"replicas,omitempty"`
176176

177-
// Identifies the primary pod UID when RECONNECT was last triggered.
178-
// Used to detect failovers and force PgBouncer to establish new
179-
// server connections to the correct primary.
177+
// Identifies the primary pod UID when failover signal (SIGTERM) was last triggered.
178+
// Used to detect failovers and trigger PgBouncer container restart for fresh
179+
// connection pool and DNS lookup to the correct primary.
180180
// +optional
181-
LastReconnectPrimaryUID string `json:"lastReconnectPrimaryUID,omitempty"`
181+
LastFailoverPrimaryUID string `json:"lastFailoverPrimaryUID,omitempty"`
182182
}

0 commit comments

Comments
 (0)