Skip to content

Commit c42c7e2

Browse files
Fix segfault when recovering 2pc transaction
When promoting a mirror segment due to failover we have seen a stacktrace like this: ``` "FATAL","58P01","requested WAL segment pg_xlog/00000023000071D50000001F has already been removed",,,,,,,0,,"xlogutils.c",580,"Stack trace: 1 0x557bdb9f09b6 postgres errstart + 0x236 2 0x557bdb5fc6cf postgres <symbol not found> + 0xdb5fc6cf 3 0x557bdb5fd021 postgres read_local_xlog_page + 0x191 4 0x557bdb5fb922 postgres <symbol not found> + 0xdb5fb922 5 0x557bdb5fba11 postgres XLogReadRecord + 0xa1 6 0x557bdb5e7767 postgres RecoverPreparedTransactions + 0xd7 7 0x557bdb5f608b postgres StartupXLOG + 0x2a3b 8 0x557bdb870a89 postgres StartupProcessMain + 0x139 9 0x557bdb62f489 postgres AuxiliaryProcessMain + 0x549 10 0x557bdb86d275 postgres <symbol not found> + 0xdb86d275 11 0x557bdb8704e3 postgres PostmasterMain + 0x1213 12 0x557bdb56a1f7 postgres main + 0x497 13 0x7fded4a61c87 libc.so.6 __libc_start_main + 0xe7 ``` Note: stacktrace is from one of our production GP clusters and might be slightly different from what we will see in cloudberry, but the failure is still present here as well. Testcase proves it. PG13 and PG14 have a fix for this bug, but it's doesn't have any test case and looks like we didn't cherry-pick that far. The discussion can be found here: https://www.postgresql.org/message-id/flat/743b9b45a2d4013bd90b6a5cba8d6faeb717ee34.camel%40cybertec.at In a few words, StartupXLOG() renames the last wal segment to .partial but tries to read it by the old name later in RecoverPreparedTransactions(). The fix is mostly borrowed from PG14 postgres/postgres@f663b00 with some cloudberry-related exceptions. Also added a regression test which segfaults without this fix for any version of GP, PG<=12 or Cloudberry.
1 parent 853596b commit c42c7e2

File tree

4 files changed

+336
-73
lines changed

4 files changed

+336
-73
lines changed

src/backend/access/transam/xlog.c

Lines changed: 73 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -8321,6 +8321,79 @@ StartupXLOG(void)
83218321
CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
83228322
}
83238323

8324+
/*
8325+
* Preallocate additional log files, if wanted.
8326+
*/
8327+
PreallocXlogFiles(EndOfLog);
8328+
8329+
/*
8330+
* Okay, we're officially UP.
8331+
*/
8332+
InRecovery = false;
8333+
8334+
/*
8335+
* Hook for plugins to do additional startup works.
8336+
*
8337+
* Allow to write any WALs in hook.
8338+
*/
8339+
if (Startup_hook)
8340+
{
8341+
LocalSetXLogInsertAllowed();
8342+
(*Startup_hook) ();
8343+
LocalXLogInsertAllowed = -1;
8344+
}
8345+
8346+
/* start the archive_timeout timer and LSN running */
8347+
XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
8348+
XLogCtl->lastSegSwitchLSN = EndOfLog;
8349+
8350+
/* also initialize latestCompletedXid, to nextXid - 1 */
8351+
LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
8352+
ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
8353+
ShmemVariableCache->latestCompletedGxid = ShmemVariableCache->nextGxid;
8354+
FullTransactionIdRetreat(&ShmemVariableCache->latestCompletedXid);
8355+
if (IsNormalProcessingMode())
8356+
elog(LOG, "latest completed transaction id is %u and next transaction id is %u",
8357+
XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid),
8358+
XidFromFullTransactionId(ShmemVariableCache->nextXid));
8359+
LWLockRelease(ProcArrayLock);
8360+
8361+
/*
8362+
* Start up subtrans, if not already done for hot standby. (commit
8363+
* timestamps are started below, if necessary.)
8364+
*/
8365+
if (standbyState == STANDBY_DISABLED)
8366+
{
8367+
StartupCLOG();
8368+
StartupSUBTRANS(oldestActiveXID);
8369+
DistributedLog_Startup(oldestActiveXID,
8370+
XidFromFullTransactionId(ShmemVariableCache->nextXid));
8371+
}
8372+
8373+
/*
8374+
* Perform end of recovery actions for any SLRUs that need it.
8375+
*/
8376+
TrimCLOG();
8377+
TrimMultiXact();
8378+
8379+
/* Reload shared-memory state for prepared transactions */
8380+
RecoverPreparedTransactions();
8381+
8382+
/* Shut down xlogreader */
8383+
if (readFile >= 0)
8384+
{
8385+
close(readFile);
8386+
readFile = -1;
8387+
}
8388+
XLogReaderFree(xlogreader);
8389+
8390+
/*
8391+
* If any of the critical GUCs have changed, log them before we allow
8392+
* backends to write WAL.
8393+
*/
8394+
LocalSetXLogInsertAllowed();
8395+
XLogReportParameters();
8396+
83248397
if (ArchiveRecoveryRequested)
83258398
{
83268399
/*
@@ -8402,28 +8475,6 @@ StartupXLOG(void)
84028475
}
84038476
}
84048477

8405-
/*
8406-
* Preallocate additional log files, if wanted.
8407-
*/
8408-
PreallocXlogFiles(EndOfLog);
8409-
8410-
/*
8411-
* Okay, we're officially UP.
8412-
*/
8413-
InRecovery = false;
8414-
8415-
/*
8416-
* Hook for plugins to do additional startup works.
8417-
*
8418-
* Allow to write any WALs in hook.
8419-
*/
8420-
if (Startup_hook)
8421-
{
8422-
LocalSetXLogInsertAllowed();
8423-
(*Startup_hook) ();
8424-
LocalXLogInsertAllowed = -1;
8425-
}
8426-
84278478
/*
84288479
* If we are a standby with contentid -1 and undergoing promotion,
84298480
* update ourselves as the new master in catalog. This does not
@@ -8433,60 +8484,9 @@ StartupXLOG(void)
84338484
bool needToPromoteCatalog = (IS_QUERY_DISPATCHER() &&
84348485
ControlFile->state == DB_IN_ARCHIVE_RECOVERY);
84358486

8436-
/* start the archive_timeout timer and LSN running */
8437-
XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
8438-
XLogCtl->lastSegSwitchLSN = EndOfLog;
8439-
8440-
/* also initialize latestCompletedXid, to nextXid - 1 */
8441-
LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
8442-
ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
8443-
ShmemVariableCache->latestCompletedGxid = ShmemVariableCache->nextGxid;
8444-
FullTransactionIdRetreat(&ShmemVariableCache->latestCompletedXid);
8445-
if (IsNormalProcessingMode())
8446-
elog(LOG, "latest completed transaction id is %u and next transaction id is %u",
8447-
XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid),
8448-
XidFromFullTransactionId(ShmemVariableCache->nextXid));
8449-
LWLockRelease(ProcArrayLock);
8450-
8451-
/*
8452-
* Start up subtrans, if not already done for hot standby. (commit
8453-
* timestamps are started below, if necessary.)
8454-
*/
8455-
if (standbyState == STANDBY_DISABLED)
8456-
{
8457-
StartupCLOG();
8458-
StartupSUBTRANS(oldestActiveXID);
8459-
DistributedLog_Startup(oldestActiveXID,
8460-
XidFromFullTransactionId(ShmemVariableCache->nextXid));
8461-
}
8462-
8463-
/*
8464-
* Perform end of recovery actions for any SLRUs that need it.
8465-
*/
8466-
TrimCLOG();
8467-
TrimMultiXact();
8468-
8469-
/* Reload shared-memory state for prepared transactions */
8470-
RecoverPreparedTransactions();
8471-
84728487
if(IsNormalProcessingMode())
84738488
ereport(LOG, (errmsg("database system is ready")));
84748489

8475-
/* Shut down xlogreader */
8476-
if (readFile >= 0)
8477-
{
8478-
close(readFile);
8479-
readFile = -1;
8480-
}
8481-
XLogReaderFree(xlogreader);
8482-
8483-
/*
8484-
* If any of the critical GUCs have changed, log them before we allow
8485-
* backends to write WAL.
8486-
*/
8487-
LocalSetXLogInsertAllowed();
8488-
XLogReportParameters();
8489-
84908490
/*
84918491
* Local WAL inserts enabled, so it's time to finish initialization of
84928492
* commit timestamp.
Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
-- Test a bug where the last XLOG segment is renamed to .partial before prepared
2+
-- transactions contained in it are recovered. This would result in segfault at
3+
-- RecoverPreparedTransactions. The failure is well described in PG hackers:
4+
-- https://www.postgresql.org/message-id/743b9b45a2d4013bd90b6a5cba8d6faeb717ee34.camel%40cybertec.at
5+
-- However, in GP due to an older version of PG this error is masked by always caching
6+
-- the last opened xlog segment in XLogRead(). To actually trigger the failure we need to
7+
-- create unfinished prepared transactions in TWO different xlog segments.
8+
9+
-- Allow extra time for mirror promotion to complete recovery to avoid
10+
-- gprecoverseg BEGIN failures due to gang creation failure as some primaries
11+
-- are not up. Setting these increase the number of retries in gang creation in
12+
-- case segment is in recovery. Approximately we want to wait 30 seconds.
13+
!\retcode gpconfig -c gp_gang_creation_retry_count -v 120 --skipvalidation --masteronly;
14+
(exited with code 0)
15+
!\retcode gpconfig -c gp_gang_creation_retry_timer -v 1000 --skipvalidation --masteronly;
16+
(exited with code 0)
17+
18+
-- The last xlog segment is renamed to .partial only when archive mode is on
19+
!\retcode gpconfig -c archive_mode -v on;
20+
(exited with code 0)
21+
!\retcode gpconfig -c archive_command -v '/bin/true';
22+
(exited with code 0)
23+
!\retcode gpstop -rai;
24+
(exited with code 0)
25+
26+
1: create extension if not exists gp_inject_fault;
27+
CREATE
28+
1: create table t_rename1 (a int);
29+
CREATE
30+
1: create table t_rename2 (a int);
31+
CREATE
32+
33+
-- generate an orphaned prepare transaction in first segment
34+
1: select gp_inject_fault('dtm_broadcast_prepare', 'suspend', dbid) from gp_segment_configuration where role = 'p' and content = -1;
35+
gp_inject_fault
36+
-----------------
37+
Success:
38+
(1 row)
39+
-- assume (2), (1) are on different segments and one tuple is on the first segment.
40+
-- the test finally double-check that.
41+
2&: insert into t_rename1 values(2),(1); <waiting ...>
42+
1: select gp_wait_until_triggered_fault('dtm_broadcast_prepare', 1, dbid) from gp_segment_configuration where role = 'p' and content = -1;
43+
gp_wait_until_triggered_fault
44+
-------------------------------
45+
Success:
46+
(1 row)
47+
48+
-- start another xlog segment (will be renamed to .partial on failover)
49+
GP_IGNORE:-- start_ignore
50+
GP_IGNORE:0U: select pg_switch_xlog();
51+
GP_IGNORE: pg_switch_xlog
52+
GP_IGNORE:----------------
53+
GP_IGNORE: 0/C0B05E0
54+
GP_IGNORE:(1 row)
55+
GP_IGNORE:-- end_ignore
56+
57+
-- inject another fault and prepare transaction in the new xlog segment
58+
1: select gp_inject_fault('dtm_broadcast_prepare', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = -1;
59+
gp_inject_fault
60+
-----------------
61+
Success:
62+
(1 row)
63+
1: select gp_inject_fault('dtm_broadcast_prepare', 'suspend', dbid) from gp_segment_configuration where role = 'p' and content = -1;
64+
gp_inject_fault
65+
-----------------
66+
Success:
67+
(1 row)
68+
-- assume (4), (3) are on different segments and one tuple is on the first segment.
69+
-- the test finally double-check that.
70+
3&: insert into t_rename2 values(2),(1); <waiting ...>
71+
1: select gp_wait_until_triggered_fault('dtm_broadcast_prepare', 1, dbid) from gp_segment_configuration where role = 'p' and content = -1;
72+
gp_wait_until_triggered_fault
73+
-------------------------------
74+
Success:
75+
(1 row)
76+
77+
-- shutdown primary and make sure the segment is down
78+
-1U: select pg_ctl((SELECT datadir from gp_segment_configuration c where c.role='p' and c.content=0), 'stop', 'immediate');
79+
pg_ctl
80+
--------
81+
OK
82+
(1 row)
83+
1: select gp_request_fts_probe_scan();
84+
gp_request_fts_probe_scan
85+
---------------------------
86+
t
87+
(1 row)
88+
1: select role, preferred_role from gp_segment_configuration where content = 0;
89+
role | preferred_role
90+
------+----------------
91+
m | p
92+
p | m
93+
(2 rows)
94+
95+
-- double confirm that promote succeeds.
96+
-- also double confirm that
97+
-- 1. tuples (2) and (1) are located on two segments (thus we are testing 2pc with prepared transaction).
98+
-- 2. there are tuples on the first segment (we have been testing on the first segment).
99+
1: insert into t_rename1 values(2),(1);
100+
INSERT 2
101+
1: select gp_segment_id, * from t_rename1 order by a;
102+
gp_segment_id | a
103+
---------------+---
104+
1 | 1
105+
0 | 2
106+
(2 rows)
107+
108+
1: select gp_inject_fault('dtm_broadcast_prepare', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = -1;
109+
gp_inject_fault
110+
-----------------
111+
Success:
112+
(1 row)
113+
2<: <... completed>
114+
INSERT 2
115+
3<: <... completed>
116+
INSERT 2
117+
118+
-- confirm the "orphaned" prepared trnasaction commits finally.
119+
1: select * from t_rename1 order by a;
120+
a
121+
---
122+
1
123+
1
124+
2
125+
2
126+
(4 rows)
127+
1: select * from t_rename2 order by a;
128+
a
129+
---
130+
1
131+
2
132+
(2 rows)
133+
134+
-- recovery the nodes
135+
!\retcode gprecoverseg -a;
136+
(exited with code 0)
137+
1: select wait_until_segment_synchronized(0);
138+
wait_until_segment_synchronized
139+
---------------------------------
140+
OK
141+
(1 row)
142+
143+
!\retcode gprecoverseg -ar;
144+
(exited with code 0)
145+
1: select wait_until_segment_synchronized(0);
146+
wait_until_segment_synchronized
147+
---------------------------------
148+
OK
149+
(1 row)
150+
151+
-- verify the first segment is recovered to the original state.
152+
1: select role, preferred_role from gp_segment_configuration where content = 0;
153+
role | preferred_role
154+
------+----------------
155+
m | m
156+
p | p
157+
(2 rows)
158+
159+
-- cleanup
160+
1: drop table t_rename1;
161+
DROP
162+
1: drop table t_rename2;
163+
DROP
164+
!\retcode gpconfig -r gp_gang_creation_retry_count --skipvalidation;
165+
(exited with code 0)
166+
!\retcode gpconfig -r gp_gang_creation_retry_timer --skipvalidation;
167+
(exited with code 0)
168+
!\retcode gpconfig -r archive_mode --skipvalidation;
169+
(exited with code 0)
170+
!\retcode gpconfig -r archive_command --skipvalidation;
171+
(exited with code 0)
172+
!\retcode gpstop -rai;
173+
(exited with code 0)

src/test/isolation2/isolation2_schedule

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,7 @@ test: segwalrep/dtm_recovery_on_standby
250250
test: segwalrep/commit_blocking_on_standby
251251
test: segwalrep/dtx_recovery_wait_lsn
252252
test: segwalrep/select_throttle
253+
test: segwalrep/startup_rename_prepared_xlog
253254
test: fts_manual_probe
254255
test: fts_session_reset
255256
# unstable FTS test in different arch

0 commit comments

Comments
 (0)