Skip to content

Commit 4d426fb

Browse files
davissp14Ben Iofel
and
Ben Iofel
authored
Node name fix (#268)
* Ensure implementation is migration friendly * fix * Set go version in dockerfiles * potential fix * Adjusting how Zombie state is evaluated on boot. * Fix unregistration err condition * Not used anymore * cleanup unregistration logic * Fix err check * Move this back to custom * Remove todo --------- Co-authored-by: Ben Iofel <[email protected]>
1 parent 31c3a48 commit 4d426fb

File tree

15 files changed

+281
-84
lines changed

15 files changed

+281
-84
lines changed

.github/workflows/push.yml

+4-4
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ jobs:
1818

1919
- uses: actions/setup-go@v3
2020
with:
21-
go-version: '1.20'
21+
go-version: '1.23'
2222
go-version-file: 'go.mod'
2323
cache: true
2424

@@ -36,7 +36,7 @@ jobs:
3636

3737
- uses: actions/setup-go@v3
3838
with:
39-
go-version: '1.20'
39+
go-version: '1.23'
4040
go-version-file: 'go.mod'
4141
cache: true
4242

@@ -52,7 +52,7 @@ jobs:
5252

5353
- uses: actions/setup-go@v3
5454
with:
55-
go-version: '1.20'
55+
go-version: '1.23'
5656
go-version-file: 'go.mod'
5757
cache: true
5858

@@ -68,7 +68,7 @@ jobs:
6868

6969
- uses: actions/setup-go@v3
7070
with:
71-
go-version: '1.20'
71+
go-version: '1.23'
7272
go-version-file: 'go.mod'
7373
cache: true
7474

bin/restart-repmgrd

+8-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
11
#!/bin/bash
22

3-
kill `cat /tmp/repmgrd.pid`
3+
if [ -f /tmp/repmgrd.pid ]; then
4+
PID=$(cat /tmp/repmgrd.pid)
5+
6+
# Check if the process is running
7+
if ps -p $PID > /dev/null 2>&1; then
8+
kill $PID
9+
fi
10+
fi

cmd/pg_unregister/main.go

+40-9
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,9 @@ func main() {
2727

2828
func processUnregistration(ctx context.Context) error {
2929
encodedArg := os.Args[1]
30-
hostnameBytes, err := base64.StdEncoding.DecodeString(encodedArg)
30+
machineBytes, err := base64.StdEncoding.DecodeString(encodedArg)
3131
if err != nil {
32-
return fmt.Errorf("failed to decode hostname: %v", err)
32+
return fmt.Errorf("failed to decode machine: %v", err)
3333
}
3434

3535
node, err := flypg.NewNode()
@@ -43,19 +43,50 @@ func processUnregistration(ctx context.Context) error {
4343
}
4444
defer func() { _ = conn.Close(ctx) }()
4545

46-
member, err := node.RepMgr.MemberByHostname(ctx, conn, string(hostnameBytes))
46+
machineID := string(machineBytes)
47+
48+
if len(machineID) != 14 {
49+
return fmt.Errorf("invalid machine id: %s", machineID)
50+
}
51+
52+
member, err := node.RepMgr.MemberByNodeName(ctx, conn, machineID)
4753
if err != nil {
48-
return fmt.Errorf("failed to resolve member: %s", err)
54+
// If no rows are found, the member was either already unregistered or never registered
55+
if err != pgx.ErrNoRows {
56+
return fmt.Errorf("failed to resolve member using %s: %s", machineID, err)
57+
}
58+
}
59+
60+
// If the member exists unregister it and remove the replication slot
61+
if member != nil {
62+
if err := node.RepMgr.UnregisterMember(*member); err != nil {
63+
return fmt.Errorf("failed to unregister member: %v", err)
64+
}
65+
66+
slotName := fmt.Sprintf("repmgr_slot_%d", member.ID)
67+
if err := removeReplicationSlot(ctx, conn, slotName); err != nil {
68+
return fmt.Errorf("failed to remove replication slot: %v", err)
69+
}
4970
}
5071

51-
if err := node.RepMgr.UnregisterMember(*member); err != nil {
52-
return fmt.Errorf("failed to unregister member: %v", err)
72+
// Redirect logs to /dev/null temporarily so we don't pollute the response data.
73+
devnull, err := os.Open(os.DevNull)
74+
if err != nil {
75+
return fmt.Errorf("failed to open /dev/null: %v", err)
5376
}
77+
defer func() { _ = devnull.Close() }()
78+
79+
// Save the original log output
80+
originalLogOutput := log.Writer()
81+
82+
// Redirect logs to /dev/null
83+
log.SetOutput(devnull)
5484

55-
slotName := fmt.Sprintf("repmgr_slot_%d", member.ID)
56-
if err := removeReplicationSlot(ctx, conn, slotName); err != nil {
57-
return err
85+
if err := flypg.EvaluateClusterState(ctx, conn, node); err != nil {
86+
return fmt.Errorf("failed to evaluate cluster state: %v", err)
5887
}
88+
// Restore the original log output
89+
log.SetOutput(originalLogOutput)
5990

6091
return nil
6192
}

go.mod

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,17 @@
11
module github.com/fly-apps/postgres-flex
22

3-
go 1.20
3+
go 1.23
44

55
require (
66
github.com/go-chi/chi/v5 v5.0.8
77
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510
88
github.com/hashicorp/consul/api v1.18.0
99
github.com/jackc/pgconn v1.14.3
1010
github.com/jackc/pgx/v5 v5.5.4
11+
github.com/olekukonko/tablewriter v0.0.5
1112
github.com/pkg/errors v0.9.1
1213
github.com/pkg/term v1.1.0
14+
github.com/spf13/cobra v1.8.1
1315
github.com/superfly/fly-checks v0.0.0-20230510154016-d189351293f2
1416
golang.org/x/exp v0.0.0-20230105202349-8879d0199aa3
1517
golang.org/x/sync v0.1.0
@@ -36,8 +38,6 @@ require (
3638
github.com/mattn/go-runewidth v0.0.9 // indirect
3739
github.com/mitchellh/go-homedir v1.1.0 // indirect
3840
github.com/mitchellh/mapstructure v1.4.1 // indirect
39-
github.com/olekukonko/tablewriter v0.0.5 // indirect
40-
github.com/spf13/cobra v1.8.1 // indirect
4141
github.com/spf13/pflag v1.0.5 // indirect
4242
github.com/stretchr/objx v0.5.0 // indirect
4343
golang.org/x/crypto v0.20.0 // indirect

go.sum

+4
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ github.com/jackc/pgconn v1.14.3/go.mod h1:RZbme4uasqzybK2RK5c65VsHxoyaml09lx3tXO
6969
github.com/jackc/pgio v1.0.0 h1:g12B9UwVnzGhueNavwioyEEpAmqMe1E/BN9ES+8ovkE=
7070
github.com/jackc/pgio v1.0.0/go.mod h1:oP+2QK2wFfUWgr+gxjoBH9KGBb31Eio69xUb0w5bYf8=
7171
github.com/jackc/pgmock v0.0.0-20210724152146-4ad1a8207f65 h1:DadwsjnMwFjfWc9y5Wi/+Zz7xoE5ALHsRQlOctkOiHc=
72+
github.com/jackc/pgmock v0.0.0-20210724152146-4ad1a8207f65/go.mod h1:5R2h2EEX+qri8jOWMbJCtaPWkrrNc7OHwsp2TCqp7ak=
7273
github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
7374
github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg=
7475
github.com/jackc/pgproto3/v2 v2.3.3 h1:1HLSx5H+tXR9pW3in3zaztoEwQYRC9SQaYUHjTSUOag=
@@ -78,6 +79,7 @@ github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a/go.mod h1:5TJZ
7879
github.com/jackc/pgx/v5 v5.5.4 h1:Xp2aQS8uXButQdnCMWNmvx6UysWQQC+u1EoizjguY+8=
7980
github.com/jackc/pgx/v5 v5.5.4/go.mod h1:ez9gk+OAat140fv9ErkZDYFWmXLfV+++K0uAOiwgm1A=
8081
github.com/jackc/puddle/v2 v2.2.1 h1:RhxXJtFG022u4ibrCSMSiu5aOq1i77R3OHKNJj77OAk=
82+
github.com/jackc/puddle/v2 v2.2.1/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4=
8183
github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
8284
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
8385
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
@@ -139,6 +141,7 @@ github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
139141
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
140142
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
141143
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
144+
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
142145
github.com/superfly/fly-checks v0.0.0-20230510154016-d189351293f2 h1:mZNiJSrmbQA/3+Vy8GLL/Q9qdHxPzjcxKv+E14GZLFs=
143146
github.com/superfly/fly-checks v0.0.0-20230510154016-d189351293f2/go.mod h1:BbqpB4y6Z/cijQqKWJ3i8LMsAoC29gzX6vsSD3Qq7uw=
144147
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
@@ -154,6 +157,7 @@ golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v
154157
golang.org/x/net v0.0.0-20210410081132-afb366fc7cd1/go.mod h1:9tjilg8BloeKEkVJvy7fQ90B1CfIiPueXVOjqfkSzI8=
155158
golang.org/x/net v0.0.0-20211216030914-fe4d6282115f/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
156159
golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4=
160+
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
157161
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
158162
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
159163
golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o=

internal/flypg/node.go

+76-14
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,12 @@ import (
1616
"github.com/fly-apps/postgres-flex/internal/privnet"
1717
"github.com/fly-apps/postgres-flex/internal/utils"
1818
"github.com/jackc/pgx/v5"
19+
"golang.org/x/exp/slices"
1920
)
2021

2122
type Node struct {
2223
AppName string
24+
MachineID string
2325
PrivateIP string
2426
PrimaryRegion string
2527
DataDir string
@@ -52,6 +54,8 @@ func NewNode() (*Node, error) {
5254

5355
node.PrivateIP = ipv6.String()
5456

57+
node.MachineID = os.Getenv("FLY_MACHINE_ID")
58+
5559
node.PrimaryRegion = os.Getenv("PRIMARY_REGION")
5660
if node.PrimaryRegion == "" {
5761
return nil, fmt.Errorf("PRIMARY_REGION environment variable must be set")
@@ -88,7 +92,9 @@ func NewNode() (*Node, error) {
8892
UserConfigPath: "/data/repmgr.user.conf",
8993
PasswordConfigPath: "/data/.pgpass",
9094
DataDir: node.DataDir,
95+
HostName: node.Hostname(),
9196
PrivateIP: node.PrivateIP,
97+
MachineID: node.MachineID,
9298
Port: 5433,
9399
DatabaseName: "repmgr",
94100
Credentials: node.ReplCredentials,
@@ -182,7 +188,7 @@ func (n *Node) Init(ctx context.Context) error {
182188
}
183189
} else {
184190
log.Println("Provisioning standby")
185-
cloneTarget, err := n.RepMgr.ResolveMemberOverDNS(ctx)
191+
cloneTarget, err := n.RepMgr.ResolvePrimaryOverDNS(ctx)
186192
if err != nil {
187193
return fmt.Errorf("failed to resolve member over dns: %s", err)
188194
}
@@ -225,14 +231,6 @@ func (n *Node) Init(ctx context.Context) error {
225231

226232
// PostInit are operations that need to be executed against a running Postgres on boot.
227233
func (n *Node) PostInit(ctx context.Context) error {
228-
if ZombieLockExists() {
229-
log.Println("[ERROR] Manual intervention required.")
230-
log.Println("[ERROR] If a new primary has been established, consider adding a new replica with `fly machines clone <primary-machine-id>` and then remove this member.")
231-
log.Println("[ERROR] Sleeping for 5 minutes.")
232-
time.Sleep(5 * time.Minute)
233-
return fmt.Errorf("unrecoverable zombie")
234-
}
235-
236234
// Use the Postgres user on boot, since our internal user may not have been created yet.
237235
conn, err := n.NewLocalConnection(ctx, "postgres", n.OperatorCredentials)
238236
if err != nil {
@@ -265,7 +263,7 @@ func (n *Node) PostInit(ctx context.Context) error {
265263
return fmt.Errorf("failed to resolve member role: %s", err)
266264
}
267265

268-
// Restart repmgrd in the event the IP changes for an already registered node.
266+
// Restart repmgrd in the event the machine ID changes for an already registered node.
269267
// This can happen if the underlying volume is moved to a different node.
270268
daemonRestartRequired := n.RepMgr.daemonRestartRequired(member)
271269

@@ -291,14 +289,22 @@ func (n *Node) PostInit(ctx context.Context) error {
291289
return fmt.Errorf("failed to run zombie diagnosis: %s", err)
292290
}
293291

294-
// This should never happen
295-
if primary != n.PrivateIP {
292+
// This should never happen, but check anyways for correctness
293+
if primary != n.Hostname() {
296294
return fmt.Errorf("resolved primary '%s' does not match ourself '%s'. this should not happen",
297295
primary,
298-
n.PrivateIP,
296+
n.Hostname(),
299297
)
300298
}
301299

300+
// Clear the zombie lock if it exists.
301+
if ZombieLockExists() {
302+
log.Println("[INFO] Clearing zombie lock and re-enabling read/write")
303+
if err := RemoveZombieLock(); err != nil {
304+
return fmt.Errorf("failed to remove zombie lock: %s", err)
305+
}
306+
}
307+
302308
// Re-register primary to apply any configuration changes.
303309
if err := n.RepMgr.registerPrimary(daemonRestartRequired); err != nil {
304310
return fmt.Errorf("failed to re-register existing primary: %s", err)
@@ -311,6 +317,10 @@ func (n *Node) PostInit(ctx context.Context) error {
311317
}
312318
}
313319
case StandbyRoleName:
320+
if err := n.migrateNodeNameIfNeeded(ctx, repConn); err != nil {
321+
return fmt.Errorf("failed to migrate node name: %s", err)
322+
}
323+
314324
// Register existing standby to apply any configuration changes.
315325
if err := n.RepMgr.registerStandby(daemonRestartRequired); err != nil {
316326
return fmt.Errorf("failed to register existing standby: %s", err)
@@ -399,7 +409,7 @@ func (n *Node) PostInit(ctx context.Context) error {
399409
return fmt.Errorf("failed to enable repmgr: %s", err)
400410
}
401411

402-
primary, err := n.RepMgr.ResolveMemberOverDNS(ctx)
412+
primary, err := n.RepMgr.ResolvePrimaryOverDNS(ctx)
403413
if err != nil {
404414
return fmt.Errorf("failed to resolve primary member: %s", err)
405415
}
@@ -527,3 +537,55 @@ func (n *Node) handleRemoteRestore(ctx context.Context, store *state.Store) erro
527537

528538
return nil
529539
}
540+
541+
// migrate node name from 6pn to machine ID if needed
542+
func (n *Node) migrateNodeNameIfNeeded(ctx context.Context, repConn *pgx.Conn) error {
543+
primary, err := n.RepMgr.PrimaryMember(ctx, repConn)
544+
if err != nil {
545+
return fmt.Errorf("failed to resolve primary member when updating standby: %s", err)
546+
}
547+
548+
primaryConn, err := n.RepMgr.NewRemoteConnection(ctx, primary.Hostname)
549+
if err != nil {
550+
return fmt.Errorf("failed to establish connection to primary: %s", err)
551+
}
552+
defer func() { _ = primaryConn.Close(ctx) }()
553+
554+
rows, err := primaryConn.Query(ctx, "select application_name from pg_stat_replication")
555+
if err != nil {
556+
return fmt.Errorf("failed to query pg_stat_replication: %s", err)
557+
}
558+
defer rows.Close()
559+
560+
var applicationNames []string
561+
for rows.Next() {
562+
var applicationName string
563+
if err := rows.Scan(&applicationName); err != nil {
564+
return fmt.Errorf("failed to scan application_name: %s", err)
565+
}
566+
applicationNames = append(applicationNames, applicationName)
567+
}
568+
if err := rows.Err(); err != nil {
569+
return fmt.Errorf("failed to iterate over rows: %s", err)
570+
}
571+
572+
// if we find our 6pn as application_name, we need to regenerate postgresql.auto.conf and reload postgresql
573+
if slices.Contains(applicationNames, n.PrivateIP) {
574+
log.Printf("pg_stat_replication on the primary has our ipv6 address as application_name, converting to machine ID...")
575+
576+
if err := n.RepMgr.regenReplicationConf(ctx); err != nil {
577+
return fmt.Errorf("failed to clone standby: %s", err)
578+
}
579+
580+
if err := admin.ReloadPostgresConfig(ctx, repConn); err != nil {
581+
return fmt.Errorf("failed to reload postgresql: %s", err)
582+
}
583+
}
584+
585+
return nil
586+
}
587+
588+
// Hostname returns the hostname of the node.
589+
func (n *Node) Hostname() string {
590+
return fmt.Sprintf("%s.vm.%s.internal", n.MachineID, n.AppName)
591+
}

internal/flypg/readonly.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ func BroadcastReadonlyChange(ctx context.Context, n *Node, enabled bool) error {
7070

7171
for _, member := range members {
7272
if member.Role == PrimaryRoleName {
73-
endpoint := fmt.Sprintf("http://[%s]:5500/%s", member.Hostname, target)
73+
endpoint := fmt.Sprintf("http://%s:5500/%s", member.Hostname, target)
7474
resp, err := http.Get(endpoint)
7575
if err != nil {
7676
log.Printf("[WARN] Failed to broadcast readonly state change to member %s: %s", member.Hostname, err)
@@ -85,7 +85,7 @@ func BroadcastReadonlyChange(ctx context.Context, n *Node, enabled bool) error {
8585
}
8686

8787
for _, member := range members {
88-
endpoint := fmt.Sprintf("http://[%s]:5500/%s", member.Hostname, RestartHaproxyEndpoint)
88+
endpoint := fmt.Sprintf("http://%s:5500/%s", member.Hostname, RestartHaproxyEndpoint)
8989
resp, err := http.Get(endpoint)
9090
if err != nil {
9191
log.Printf("[WARN] Failed to restart haproxy on member %s: %s", member.Hostname, err)

0 commit comments

Comments
 (0)