Skip to content

Commit d3330fd

Browse files
authored
Shard move in block_writes mode fails with idle_in_transaction_session_timeout on metadata workers (#8484)
### Description When performing a shard move using block_writes transfer mode (either directly via citus_move_shard_placement or through the background rebalancer), the operation can fail with: ``` ERROR: terminating connection due to idle-in-transaction timeout CONTEXT: while executing command on <worker_host>:<worker_port> ``` The failing worker is a metadata worker that is neither the source nor the target of the shard move. ### Root Cause LockShardListMetadataOnWorkers() opens coordinated transactions on all metadata workers to acquire advisory shard metadata locks via SELECT lock_shard_metadata(...). These transactions remain open until the entire shard move completes and the coordinated transaction commits. In block_writes mode, the data copy phase (CopyShardsToNode) runs synchronously between the source and target workers. Metadata workers not involved in the copy have no commands to execute and their connections sit completely idle-in-transaction for the entire duration of the data copy. For large shards, the copy can take significantly longer than common idle_in_transaction_session_timeout values. When the timeout fires on an uninvolved worker, PostgreSQL terminates the connection, causing the shard move to fail. This also affects shard splits, since they follow the same code path through LockShardListMetadataOnWorkers. ### Fix LockShardListMetadataOnWorkers() should send SET LOCAL idle_in_transaction_session_timeout = 0 on each metadata worker connection before acquiring the locks. SET LOCAL scopes the change to the current transaction only, so normal sessions on the workers are unaffected.
1 parent 546f206 commit d3330fd

File tree

3 files changed

+133
-1
lines changed

3 files changed

+133
-1
lines changed

src/backend/distributed/utils/resource_lock.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -405,7 +405,17 @@ LockShardListMetadataOnWorkers(LOCKMODE lockmode, List *shardIntervalList)
405405

406406
appendStringInfo(lockCommand, "])");
407407

408-
SendCommandToWorkersWithMetadata(lockCommand->data);
408+
/*
409+
* Disable idle_in_transaction_session_timeout on metadata workers before
410+
* acquiring locks. In block_writes mode, these connections stay open for
411+
* the entire shard copy which can take hours for large shards. Without
412+
* this, the timeout would kill the connection and fail the move.
413+
* SET LOCAL scopes the change to this transaction only.
414+
*/
415+
List *commandList = list_make2(
416+
"SET LOCAL idle_in_transaction_session_timeout = 0",
417+
lockCommand->data);
418+
SendCommandListToWorkersWithMetadata(commandList);
409419
}
410420

411421

src/test/regress/expected/shard_move_constraints_blocking.out

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -399,3 +399,87 @@ drop cascades to table "blocking shard Move Fkeys Indexes".reference_table
399399
drop cascades to table "blocking shard Move Fkeys Indexes".reference_table_8970028
400400
drop cascades to table "blocking shard Move Fkeys Indexes".index_backed_rep_identity
401401
DROP ROLE mx_rebalancer_blocking_role_ent;
402+
-- Test: block_writes shard move succeeds even when workers have a low
403+
-- idle_in_transaction_session_timeout. LockShardListMetadataOnWorkers opens
404+
-- coordinated transactions on ALL metadata workers before the data copy.
405+
-- Workers not involved in the copy sit idle-in-transaction for the entire
406+
-- duration. Without the SET LOCAL override, the timeout would kill those
407+
-- connections and fail the move.
408+
SET citus.next_shard_id TO 8980000;
409+
SET citus.shard_count TO 4;
410+
SET citus.shard_replication_factor TO 1;
411+
CREATE SCHEMA blocking_move_idle_timeout;
412+
SET search_path TO blocking_move_idle_timeout;
413+
-- set a very low idle_in_transaction_session_timeout on all nodes
414+
SELECT 1 FROM run_command_on_all_nodes(
415+
'ALTER SYSTEM SET idle_in_transaction_session_timeout = ''1s''');
416+
?column?
417+
---------------------------------------------------------------------
418+
1
419+
1
420+
1
421+
(3 rows)
422+
423+
SELECT 1 FROM run_command_on_all_nodes('SELECT pg_reload_conf()');
424+
?column?
425+
---------------------------------------------------------------------
426+
1
427+
1
428+
1
429+
(3 rows)
430+
431+
-- allow the reload to take effect
432+
SELECT pg_sleep(0.5);
433+
pg_sleep
434+
---------------------------------------------------------------------
435+
436+
(1 row)
437+
438+
CREATE TABLE test_move(id int PRIMARY KEY, val text);
439+
SELECT create_distributed_table('test_move', 'id');
440+
create_distributed_table
441+
---------------------------------------------------------------------
442+
443+
(1 row)
444+
445+
INSERT INTO test_move SELECT i, 'val_' || i FROM generate_series(1, 100) i;
446+
-- move a shard using block_writes; should succeed despite the 1s timeout
447+
SELECT citus_move_shard_placement(8980000, 'localhost', :worker_1_port, 'localhost', :worker_2_port, shard_transfer_mode:='block_writes');
448+
citus_move_shard_placement
449+
---------------------------------------------------------------------
450+
451+
(1 row)
452+
453+
SELECT public.wait_for_resource_cleanup();
454+
wait_for_resource_cleanup
455+
---------------------------------------------------------------------
456+
457+
(1 row)
458+
459+
-- verify data integrity after move
460+
SELECT count(*) FROM test_move;
461+
count
462+
---------------------------------------------------------------------
463+
100
464+
(1 row)
465+
466+
-- cleanup: restore idle_in_transaction_session_timeout
467+
SELECT 1 FROM run_command_on_all_nodes(
468+
'ALTER SYSTEM RESET idle_in_transaction_session_timeout');
469+
?column?
470+
---------------------------------------------------------------------
471+
1
472+
1
473+
1
474+
(3 rows)
475+
476+
SELECT 1 FROM run_command_on_all_nodes('SELECT pg_reload_conf()');
477+
?column?
478+
---------------------------------------------------------------------
479+
1
480+
1
481+
1
482+
(3 rows)
483+
484+
DROP SCHEMA blocking_move_idle_timeout CASCADE;
485+
NOTICE: drop cascades to table test_move

src/test/regress/sql/shard_move_constraints_blocking.sql

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,3 +222,41 @@ ALTER TABLE sensors_2020_01_01 DROP CONSTRAINT fkey_from_child_to_child;
222222
\c - postgres - :master_port
223223
DROP SCHEMA "blocking shard Move Fkeys Indexes" CASCADE;
224224
DROP ROLE mx_rebalancer_blocking_role_ent;
225+
226+
-- Test: block_writes shard move succeeds even when workers have a low
227+
-- idle_in_transaction_session_timeout. LockShardListMetadataOnWorkers opens
228+
-- coordinated transactions on ALL metadata workers before the data copy.
229+
-- Workers not involved in the copy sit idle-in-transaction for the entire
230+
-- duration. Without the SET LOCAL override, the timeout would kill those
231+
-- connections and fail the move.
232+
SET citus.next_shard_id TO 8980000;
233+
SET citus.shard_count TO 4;
234+
SET citus.shard_replication_factor TO 1;
235+
236+
CREATE SCHEMA blocking_move_idle_timeout;
237+
SET search_path TO blocking_move_idle_timeout;
238+
239+
-- set a very low idle_in_transaction_session_timeout on all nodes
240+
SELECT 1 FROM run_command_on_all_nodes(
241+
'ALTER SYSTEM SET idle_in_transaction_session_timeout = ''1s''');
242+
SELECT 1 FROM run_command_on_all_nodes('SELECT pg_reload_conf()');
243+
-- allow the reload to take effect
244+
SELECT pg_sleep(0.5);
245+
246+
CREATE TABLE test_move(id int PRIMARY KEY, val text);
247+
SELECT create_distributed_table('test_move', 'id');
248+
INSERT INTO test_move SELECT i, 'val_' || i FROM generate_series(1, 100) i;
249+
250+
-- move a shard using block_writes; should succeed despite the 1s timeout
251+
SELECT citus_move_shard_placement(8980000, 'localhost', :worker_1_port, 'localhost', :worker_2_port, shard_transfer_mode:='block_writes');
252+
SELECT public.wait_for_resource_cleanup();
253+
254+
-- verify data integrity after move
255+
SELECT count(*) FROM test_move;
256+
257+
-- cleanup: restore idle_in_transaction_session_timeout
258+
SELECT 1 FROM run_command_on_all_nodes(
259+
'ALTER SYSTEM RESET idle_in_transaction_session_timeout');
260+
SELECT 1 FROM run_command_on_all_nodes('SELECT pg_reload_conf()');
261+
262+
DROP SCHEMA blocking_move_idle_timeout CASCADE;

0 commit comments

Comments
 (0)