Skip to content

Commit df20274

Browse files
committed
fix(collector): handle Aurora's unsupported pg_last_xact_replay_timestamp
Aurora PostgreSQL does not support pg_last_xact_replay_timestamp() and returns a feature_not_supported error (code 0A000) when the replication collector queries it. This causes the collector's Update to return an error, so replication metrics fail to be collected on every scrape for Aurora instances. When this error is detected, the collector now falls back to a simpler query that only reads pg_is_in_recovery(), so is_replica is still reported correctly. The time-based metrics (lag_seconds and last_replay_seconds) are emitted as NaN to signal that the values are unavailable, rather than failing the collection cycle entirely. The error is identified by checking for a *pq.Error with class "0A" (feature_not_supported) and a message that contains "Aurora", which avoids incorrectly suppressing the same error code on standard Postgres. A new test TestPgReplicationCollectorAurora covers this fallback path.
1 parent f0117d7 commit df20274

File tree

2 files changed

+100
-5
lines changed

2 files changed

+100
-5
lines changed

collector/pg_replication.go

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,12 @@ package collector
1515

1616
import (
1717
"context"
18+
"database/sql"
19+
"errors"
20+
"math"
21+
"strings"
1822

23+
"github.com/lib/pq"
1924
"github.com/prometheus/client_golang/prometheus"
2025
)
2126

@@ -72,32 +77,71 @@ var (
7277
ELSE 0
7378
END as is_replica,
7479
GREATEST (0, EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))) as last_replay`
80+
81+
// pgReplicationIsReplicaQuery is a fallback used when the full query fails
82+
// on Aurora PostgreSQL, which does not support pg_last_xact_replay_timestamp().
83+
pgReplicationIsReplicaQuery = `SELECT CASE WHEN pg_is_in_recovery() THEN 1 ELSE 0 END as is_replica`
7584
)
7685

86+
// isAuroraUnsupportedFunction returns true when Aurora PostgreSQL rejects a
87+
// query because it calls a function that is not supported on Aurora (e.g.
88+
// pg_last_xact_replay_timestamp). Aurora surfaces this as Postgres error class
89+
// "0A" (feature_not_supported) with a message that identifies the word "Aurora".
90+
func isAuroraUnsupportedFunction(err error) bool {
91+
var pqErr *pq.Error
92+
if errors.As(err, &pqErr) {
93+
return pqErr.Code.Class() == "0A" && strings.Contains(pqErr.Message, "Aurora")
94+
}
95+
return false
96+
}
97+
7798
func (c *PGReplicationCollector) Update(ctx context.Context, instance *instance, ch chan<- prometheus.Metric) error {
7899
db := instance.getDB()
79100
row := db.QueryRowContext(ctx,
80101
pgReplicationQuery,
81102
)
82103

83-
var lag float64
104+
var lag sql.NullFloat64
84105
var isReplica int64
85-
var replayAge float64
106+
var replayAge sql.NullFloat64
86107
err := row.Scan(&lag, &isReplica, &replayAge)
87108
if err != nil {
88-
return err
109+
if isAuroraUnsupportedFunction(err) {
110+
// Aurora PostgreSQL does not support pg_last_xact_replay_timestamp().
111+
// Emit NaN for the time-based metrics and fall back to a simpler query
112+
// that still reports is_replica.
113+
lag = sql.NullFloat64{Valid: false}
114+
replayAge = sql.NullFloat64{Valid: false}
115+
116+
row2 := db.QueryRowContext(ctx, pgReplicationIsReplicaQuery)
117+
if err2 := row2.Scan(&isReplica); err2 != nil {
118+
isReplica = 0
119+
}
120+
} else {
121+
return err
122+
}
89123
}
124+
125+
lagValue := math.NaN()
126+
if lag.Valid {
127+
lagValue = lag.Float64
128+
}
129+
replayAgeValue := math.NaN()
130+
if replayAge.Valid {
131+
replayAgeValue = replayAge.Float64
132+
}
133+
90134
ch <- prometheus.MustNewConstMetric(
91135
pgReplicationLag,
92-
prometheus.GaugeValue, lag,
136+
prometheus.GaugeValue, lagValue,
93137
)
94138
ch <- prometheus.MustNewConstMetric(
95139
pgReplicationIsReplica,
96140
prometheus.GaugeValue, float64(isReplica),
97141
)
98142
ch <- prometheus.MustNewConstMetric(
99143
pgReplicationLastReplay,
100-
prometheus.GaugeValue, replayAge,
144+
prometheus.GaugeValue, replayAgeValue,
101145
)
102146
return nil
103147
}

collector/pg_replication_test.go

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,11 @@ package collector
1414

1515
import (
1616
"context"
17+
"math"
1718
"testing"
1819

1920
"github.com/DATA-DOG/go-sqlmock"
21+
"github.com/lib/pq"
2022
"github.com/prometheus/client_golang/prometheus"
2123
dto "github.com/prometheus/client_model/go"
2224
"github.com/smartystreets/goconvey/convey"
@@ -62,3 +64,52 @@ func TestPgReplicationCollector(t *testing.T) {
6264
t.Errorf("there were unfulfilled exceptions: %s", err)
6365
}
6466
}
67+
68+
func TestPgReplicationCollectorAurora(t *testing.T) {
69+
db, mock, err := sqlmock.New()
70+
if err != nil {
71+
t.Fatalf("Error opening a stub db connection: %s", err)
72+
}
73+
defer db.Close()
74+
75+
inst := &instance{db: db}
76+
77+
// Aurora rejects the main query because pg_last_xact_replay_timestamp() is
78+
// not supported. The collector should fall back to the simpler is_replica
79+
// query and emit NaN for the time-based metrics.
80+
auroraErr := &pq.Error{
81+
Code: "0A000", // feature_not_supported
82+
Message: "pg_last_xact_replay_timestamp() is currently not supported for Aurora",
83+
}
84+
mock.ExpectQuery(sanitizeQuery(pgReplicationQuery)).WillReturnError(auroraErr)
85+
86+
fallbackColumns := []string{"is_replica"}
87+
fallbackRows := sqlmock.NewRows(fallbackColumns).AddRow(1)
88+
mock.ExpectQuery(sanitizeQuery(pgReplicationIsReplicaQuery)).WillReturnRows(fallbackRows)
89+
90+
ch := make(chan prometheus.Metric, 3)
91+
c := PGReplicationCollector{}
92+
if err := c.Update(context.Background(), inst, ch); err != nil {
93+
t.Fatalf("Unexpected error from Update on Aurora: %s", err)
94+
}
95+
close(ch)
96+
97+
metrics := make([]MetricResult, 0, 3)
98+
for m := range ch {
99+
metrics = append(metrics, readMetric(m))
100+
}
101+
102+
convey.Convey("Aurora fallback metrics", t, func() {
103+
convey.So(len(metrics), convey.ShouldEqual, 3)
104+
// lag should be NaN
105+
convey.So(math.IsNaN(metrics[0].value), convey.ShouldBeTrue)
106+
// is_replica should be 1
107+
convey.So(metrics[1].value, convey.ShouldEqual, 1)
108+
// last_replay should be NaN
109+
convey.So(math.IsNaN(metrics[2].value), convey.ShouldBeTrue)
110+
})
111+
112+
if err := mock.ExpectationsWereMet(); err != nil {
113+
t.Errorf("there were unfulfilled expectations: %s", err)
114+
}
115+
}

0 commit comments

Comments
 (0)