|
31 | 31 |
|
32 | 32 | #define CREATE_RESTORE_POINT_COMMAND "SELECT pg_catalog.pg_create_restore_point($1::text)" |
33 | 33 |
|
/*
 * BLOCK_DISTRIBUTED_WRITES_COMMAND acquires ExclusiveLock on:
 * 1. pg_dist_transaction - blocks 2PC commit decisions
 * 2. pg_dist_partition - blocks DDL operations on distributed tables
 *
 * This ensures both DML (via 2PC) and DDL are blocked on metadata nodes.
 *
 * NOTE(review): this is two SQL statements joined by ';'. When sent over the
 * simple query protocol the remote server returns one result per statement,
 * so callers must drain and check BOTH results -- checking only the first
 * would hide a failure of the second LOCK TABLE.
 */
#define BLOCK_DISTRIBUTED_WRITES_COMMAND \
	"LOCK TABLE pg_catalog.pg_dist_transaction IN EXCLUSIVE MODE; " \
	"LOCK TABLE pg_catalog.pg_dist_partition IN EXCLUSIVE MODE"
34 | 44 |
|
35 | 45 | /* local functions forward declarations */ |
36 | 46 | static List * OpenConnectionsToAllWorkerNodes(LOCKMODE lockMode); |
37 | 47 | static void BlockDistributedTransactions(void); |
38 | 48 | static void CreateRemoteRestorePoints(char *restoreName, List *connectionList); |
| 49 | +static void BlockDistributedTransactionsOnAllMetadataNodes(List *connectionList); |
39 | 50 |
|
40 | 51 |
|
41 | 52 | /* exports for SQL callable functions */ |
42 | 53 | PG_FUNCTION_INFO_V1(citus_create_restore_point); |
43 | 54 |
|
44 | 55 |
|
45 | 56 | /* |
46 | | - * citus_create_restore_point blocks writes to distributed tables and then |
47 | | - * runs pg_create_restore_point on all nodes. This creates a consistent |
48 | | - * restore point under the assumption that there are no other writers |
49 | | - * than the coordinator. |
| 57 | + * citus_create_restore_point creates a cluster-consistent restore point |
| 58 | + * across all nodes in the Citus cluster. |
| 59 | + * |
| 60 | + * In coordinator-only mode, this function blocks new distributed writes |
| 61 | + * at the coordinator and creates restore points on all worker nodes. |
| 62 | + * |
| 63 | + * In MX mode (multi-writer), this function blocks both DML and DDL |
| 64 | + * operations on all metadata nodes by acquiring ExclusiveLock on: |
| 65 | + * - pg_dist_transaction: blocks 2PC commit decisions (DML) |
| 66 | + * - pg_dist_partition: blocks DDL on distributed tables |
| 67 | + * |
| 68 | + * This prevents new distributed transactions from recording commit decisions |
| 69 | + * and blocks schema changes, ensuring all restore points represent the same |
| 70 | + * consistent cluster state. |
| 71 | + * |
| 72 | + * The function returns the LSN of the restore point on the coordinator, |
| 73 | + * maintaining backward compatibility with the original implementation. |
| 74 | + * |
| 75 | + * Key insight: We do NOT need to drain in-flight transactions. The commit |
| 76 | + * decision in Citus 2PC happens when LogTransactionRecord() writes to |
| 77 | + * pg_dist_transaction, which occurs BEFORE the writer's local commit. |
| 78 | + * By blocking writes to pg_dist_transaction, we prevent commit decisions |
| 79 | + * from being made. Transactions that have already recorded their commit |
| 80 | + * decision will complete normally, while those that haven't will |
| 81 | + * be blocked. This creates a clean cut point for consistency. |
50 | 82 | */ |
51 | 83 | Datum |
52 | 84 | citus_create_restore_point(PG_FUNCTION_ARGS) |
@@ -88,22 +120,56 @@ citus_create_restore_point(PG_FUNCTION_ARGS) |
88 | 120 | * ShareLock prevents new nodes being added, rendering connectionList incomplete |
89 | 121 | */ |
90 | 122 | List *connectionList = OpenConnectionsToAllWorkerNodes(ShareLock); |
| 123 | + XLogRecPtr localRestorePoint = InvalidXLogRecPtr; |
91 | 124 |
|
92 | | - /* |
93 | | - * Send a BEGIN to bust through pgbouncer. We won't actually commit since |
94 | | - * that takes time. Instead we just close the connections and roll back, |
95 | | - * which doesn't undo pg_create_restore_point. |
96 | | - */ |
97 | | - RemoteTransactionListBegin(connectionList); |
| 125 | + PG_TRY(); |
| 126 | + { |
| 127 | + /* |
| 128 | + * Send a BEGIN to bust through pgbouncer. We won't actually commit since |
| 129 | + * that takes time. Instead we just close the connections and roll back, |
| 130 | + * which doesn't undo pg_create_restore_point. |
| 131 | + */ |
| 132 | + RemoteTransactionListBegin(connectionList); |
| 133 | + |
| 134 | + /* DANGER: finish as quickly as possible after this */ |
| 135 | + BlockDistributedTransactions(); |
98 | 136 |
|
99 | | - /* DANGER: finish as quickly as possible after this */ |
100 | | - BlockDistributedTransactions(); |
| 137 | + BlockDistributedTransactionsOnAllMetadataNodes(connectionList); |
101 | 138 |
|
102 | | - /* do local restore point first to bail out early if something goes wrong */ |
103 | | - XLogRecPtr localRestorePoint = XLogRestorePoint(restoreNameString); |
| 139 | + /* do local restore point first to bail out early if something goes wrong */ |
| 140 | + localRestorePoint = XLogRestorePoint(restoreNameString); |
104 | 141 |
|
105 | | - /* run pg_create_restore_point on all nodes */ |
106 | | - CreateRemoteRestorePoints(restoreNameString, connectionList); |
| 142 | + /* run pg_create_restore_point on all nodes */ |
| 143 | + CreateRemoteRestorePoints(restoreNameString, connectionList); |
| 144 | + |
| 145 | +        /* close connections to all nodes; all locks are released
| 146 | +         * as part of the transaction rollback
| 147 | +         */
| 148 | + MultiConnection *conn = NULL; |
| 149 | + foreach_declared_ptr(conn, connectionList) |
| 150 | + { |
| 151 | + ForgetResults(conn); |
| 152 | + CloseConnection(conn); |
| 153 | + } |
| 154 | + connectionList = NIL; |
| 155 | + } |
| 156 | + PG_CATCH(); |
| 157 | + { |
| 158 | + /* |
| 159 | + * On error, ensure we clean up connections and release locks. |
| 160 | + * Rolling back the metadata node transactions releases the |
| 161 | + * ExclusiveLocks on pg_dist_transaction cluster-wide. |
| 162 | + */ |
| 163 | + MultiConnection *conn = NULL; |
| 164 | + foreach_declared_ptr(conn, connectionList) |
| 165 | + { |
| 166 | + ForgetResults(conn); |
| 167 | + CloseConnection(conn); |
| 168 | + } |
| 169 | + connectionList = NIL; |
| 170 | + PG_RE_THROW(); |
| 171 | + } |
| 172 | + PG_END_TRY(); |
107 | 173 |
|
108 | 174 | PG_RETURN_LSN(localRestorePoint); |
109 | 175 | } |
@@ -152,6 +218,89 @@ BlockDistributedTransactions(void) |
152 | 218 | } |
153 | 219 |
|
154 | 220 |
|
| 221 | +/* |
| 222 | + * BlockDistributedTransactionsOnAllMetadataNodes blocks distributed transactions |
| 223 | + * on all metadata nodes by executing pg_lock_table remotely. |
| 224 | + * |
| 225 | + * This is the MX-mode equivalent of BlockDistributedTransactions(), extended |
| 226 | + * to all nodes capable of initiating distributed transactions. We must hold |
| 227 | + * these locks across the cluster to prevent commit decisions from being made |
| 228 | + * on any node. |
| 229 | + * |
| 230 | + * The function expects that connections are already in a transaction block |
| 231 | + * (BEGIN has been sent). The locks will be held until the transaction is |
| 232 | + * rolled back or committed. |
| 233 | + */ |
| 234 | +static void |
| 235 | +BlockDistributedTransactionsOnAllMetadataNodes(List *connectionList) |
| 236 | +{ |
| 237 | + /* |
| 238 | + * Send LOCK TABLE commands to all metadata nodes in parallel. We use |
| 239 | + * standard SQL LOCK TABLE syntax to acquire ExclusiveLock on catalog |
| 240 | + * tables, mirroring what BlockDistributedTransactions() does on the |
| 241 | + * coordinator via LockRelationOid(). |
| 242 | + * |
| 243 | + * The BLOCK_DISTRIBUTED_WRITES_COMMAND acquires: |
| 244 | + * 1. ExclusiveLock on pg_dist_transaction (blocks 2PC commit decisions) |
| 245 | + * 2. ExclusiveLock on pg_dist_partition (blocks DDL on distributed tables) |
| 246 | + * |
| 247 | + * Note: Unlike the local coordinator lock which also locks pg_dist_node, |
| 248 | + * we don't lock pg_dist_node on remote nodes because node management |
| 249 | + * operations (adding/removing nodes) are still coordinator-only. |
| 250 | + * |
| 251 | + * These locks naturally serialize concurrent restore point operations |
| 252 | + * cluster-wide, so no additional advisory lock is needed. |
| 253 | + */ |
| 254 | + |
| 255 | + /* Build list of remote metadata node connections */ |
| 256 | + List *metadataConnectionList = NIL; |
| 257 | + MultiConnection *connection = NULL; |
| 258 | + foreach_declared_ptr(connection, connectionList) |
| 259 | + { |
| 260 | + WorkerNode *workerNode = FindWorkerNode(connection->hostname, connection->port); |
| 261 | + bool isRemoteMetadataNode = workerNode != NULL && |
| 262 | + NodeIsPrimaryAndRemote(workerNode); |
| 263 | + |
| 264 | + if (isRemoteMetadataNode) |
| 265 | + { |
| 266 | + metadataConnectionList = lappend(metadataConnectionList, connection); |
| 267 | + } |
| 268 | + } |
| 269 | + |
| 270 | + /* Send lock commands in parallel to all remote metadata nodes */ |
| 271 | + foreach_declared_ptr(connection, metadataConnectionList) |
| 272 | + { |
| 273 | + /* |
| 274 | + * We could use ExecuteCriticalRemoteCommand instead, but it would |
| 275 | + * not allow us to execute the commands in parallel. So for sake of |
| 276 | + * performance, we use SendRemoteCommand and send lock commands in parallel |
| 277 | + * to all metadata nodes, and later wait for all lock acquisitions to complete. |
| 278 | + */ |
| 279 | + int querySent = SendRemoteCommand(connection, BLOCK_DISTRIBUTED_WRITES_COMMAND); |
| 280 | + if (querySent == 0) |
| 281 | + { |
| 282 | + ReportConnectionError(connection, ERROR); |
| 283 | + } |
| 284 | + } |
| 285 | + |
| 286 | + /* |
| 287 | + * Wait for all lock acquisitions to complete. If any node fails to |
| 288 | + * acquire locks (e.g., due to a conflicting lock), this will error out. |
| 289 | + */ |
| 290 | + foreach_declared_ptr(connection, metadataConnectionList) |
| 291 | + { |
| 292 | + PGresult *result = GetRemoteCommandResult(connection, true); |
| 293 | + if (!IsResponseOK(result)) |
| 294 | + { |
| 295 | + ReportResultError(connection, result, ERROR); |
| 296 | + } |
| 297 | + |
| 298 | + PQclear(result); |
| 299 | + ForgetResults(connection); |
| 300 | + } |
| 301 | +} |
| 302 | + |
| 303 | + |
155 | 304 | /* |
156 | 305 | * CreateRemoteRestorePoints creates a restore point via each of the |
157 | 306 | * connections in the list in parallel. |
@@ -186,6 +335,5 @@ CreateRemoteRestorePoints(char *restoreName, List *connectionList) |
186 | 335 | PQclear(result); |
187 | 336 |
|
188 | 337 | ForgetResults(connection); |
189 | | - CloseConnection(connection); |
190 | 338 | } |
191 | 339 | } |
0 commit comments