Shard move in block_writes mode fails with idle_in_transaction_session_timeout on metadata workers (#8484)

codeforall · web-flow · commit d3330fdfe182 · 2026-03-02T14:40:49.000+03:00
### Description

When performing a shard move using block_writes transfer mode (either
directly via citus_move_shard_placement or through the background
rebalancer), the operation can fail with:

```
   ERROR: terminating connection due to idle-in-transaction timeout
   CONTEXT: while executing command on <worker_host>:<worker_port>

```
The failing worker is a metadata worker that is neither the source nor
the target of the shard move.

### Root Cause
LockShardListMetadataOnWorkers() opens coordinated transactions on all
metadata workers to acquire advisory shard metadata locks via SELECT
lock_shard_metadata(...). These transactions remain open until the
entire shard move completes and the coordinated transaction commits.

In block_writes mode, the data copy phase (CopyShardsToNode) runs
synchronously between the source and target workers. Metadata workers
not involved in the copy have no commands to execute and their
connections sit completely idle-in-transaction for the entire duration
of the data copy.

For large shards, the copy can take significantly longer than common
idle_in_transaction_session_timeout values. When the timeout fires on an
uninvolved worker, PostgreSQL terminates the connection, causing the
shard move to fail.

This also affects shard splits, since they follow the same code path
through LockShardListMetadataOnWorkers.

### Fix
LockShardListMetadataOnWorkers() now sends SET LOCAL
idle_in_transaction_session_timeout = 0 (a value of 0 disables the
timeout) on each metadata worker connection before acquiring the locks.
SET LOCAL scopes the change to the current transaction only, so normal
sessions on the workers are unaffected.
diff --git a/src/backend/distributed/utils/resource_lock.c b/src/backend/distributed/utils/resource_lock.c
@@ -405,7 +405,17 @@ LockShardListMetadataOnWorkers(LOCKMODE lockmode, List *shardIntervalList)
 
 	appendStringInfo(lockCommand, "])");
 
-	SendCommandToWorkersWithMetadata(lockCommand->data);
+	/*
+	 * Disable idle_in_transaction_session_timeout on metadata workers before
+	 * acquiring locks. In block_writes mode, these connections stay open for
+	 * the entire shard copy which can take hours for large shards. Without
+	 * this, the timeout would kill the connection and fail the move.
+	 * SET LOCAL scopes the change to this transaction only.
+	 */
+	List *commandList = list_make2(
+		"SET LOCAL idle_in_transaction_session_timeout = 0",
+		lockCommand->data);
+	SendCommandListToWorkersWithMetadata(commandList);
 }
 
 
diff --git a/src/test/regress/expected/shard_move_constraints_blocking.out b/src/test/regress/expected/shard_move_constraints_blocking.out
@@ -399,3 +399,87 @@ drop cascades to table "blocking shard Move Fkeys Indexes".reference_table
 drop cascades to table "blocking shard Move Fkeys Indexes".reference_table_8970028
 drop cascades to table "blocking shard Move Fkeys Indexes".index_backed_rep_identity
 DROP ROLE mx_rebalancer_blocking_role_ent;
+-- Test: block_writes shard move succeeds even when workers have a low
+-- idle_in_transaction_session_timeout. LockShardListMetadataOnWorkers opens
+-- coordinated transactions on ALL metadata workers before the data copy.
+-- Workers not involved in the copy sit idle-in-transaction for the entire
+-- duration. Without the SET LOCAL override, the timeout would kill those
+-- connections and fail the move.
+SET citus.next_shard_id TO 8980000;
+SET citus.shard_count TO 4;
+SET citus.shard_replication_factor TO 1;
+CREATE SCHEMA blocking_move_idle_timeout;
+SET search_path TO blocking_move_idle_timeout;
+-- set a very low idle_in_transaction_session_timeout on all nodes
+SELECT 1 FROM run_command_on_all_nodes(
+    'ALTER SYSTEM SET idle_in_transaction_session_timeout = ''1s''');
+ ?column?
+---------------------------------------------------------------------
+        1
+        1
+        1
+(3 rows)
+
+SELECT 1 FROM run_command_on_all_nodes('SELECT pg_reload_conf()');
+ ?column?
+---------------------------------------------------------------------
+        1
+        1
+        1
+(3 rows)
+
+-- allow the reload to take effect
+SELECT pg_sleep(0.5);
+ pg_sleep
+---------------------------------------------------------------------
+
+(1 row)
+
+CREATE TABLE test_move(id int PRIMARY KEY, val text);
+SELECT create_distributed_table('test_move', 'id');
+ create_distributed_table
+---------------------------------------------------------------------
+
+(1 row)
+
+INSERT INTO test_move SELECT i, 'val_' || i FROM generate_series(1, 100) i;
+-- move a shard using block_writes; should succeed despite the 1s timeout
+SELECT citus_move_shard_placement(8980000, 'localhost', :worker_1_port, 'localhost', :worker_2_port, shard_transfer_mode:='block_writes');
+ citus_move_shard_placement
+---------------------------------------------------------------------
+
+(1 row)
+
+SELECT public.wait_for_resource_cleanup();
+ wait_for_resource_cleanup
+---------------------------------------------------------------------
+
+(1 row)
+
+-- verify data integrity after move
+SELECT count(*) FROM test_move;
+ count
+---------------------------------------------------------------------
+   100
+(1 row)
+
+-- cleanup: restore idle_in_transaction_session_timeout
+SELECT 1 FROM run_command_on_all_nodes(
+    'ALTER SYSTEM RESET idle_in_transaction_session_timeout');
+ ?column?
+---------------------------------------------------------------------
+        1
+        1
+        1
+(3 rows)
+
+SELECT 1 FROM run_command_on_all_nodes('SELECT pg_reload_conf()');
+ ?column?
+---------------------------------------------------------------------
+        1
+        1
+        1
+(3 rows)
+
+DROP SCHEMA blocking_move_idle_timeout CASCADE;
+NOTICE:  drop cascades to table test_move
diff --git a/src/test/regress/sql/shard_move_constraints_blocking.sql b/src/test/regress/sql/shard_move_constraints_blocking.sql
@@ -222,3 +222,41 @@ ALTER TABLE sensors_2020_01_01 DROP CONSTRAINT fkey_from_child_to_child;
 \c - postgres - :master_port
 DROP SCHEMA "blocking shard Move Fkeys Indexes" CASCADE;
 DROP ROLE mx_rebalancer_blocking_role_ent;
+
+-- Test: block_writes shard move succeeds even when workers have a low
+-- idle_in_transaction_session_timeout. LockShardListMetadataOnWorkers opens
+-- coordinated transactions on ALL metadata workers before the data copy.
+-- Workers not involved in the copy sit idle-in-transaction for the entire
+-- duration. Without the SET LOCAL override, the timeout would kill those
+-- connections and fail the move.
+SET citus.next_shard_id TO 8980000;
+SET citus.shard_count TO 4;
+SET citus.shard_replication_factor TO 1;
+
+CREATE SCHEMA blocking_move_idle_timeout;
+SET search_path TO blocking_move_idle_timeout;
+
+-- set a very low idle_in_transaction_session_timeout on all nodes
+SELECT 1 FROM run_command_on_all_nodes(
+    'ALTER SYSTEM SET idle_in_transaction_session_timeout = ''1s''');
+SELECT 1 FROM run_command_on_all_nodes('SELECT pg_reload_conf()');
+-- allow the reload to take effect
+SELECT pg_sleep(0.5);
+
+CREATE TABLE test_move(id int PRIMARY KEY, val text);
+SELECT create_distributed_table('test_move', 'id');
+INSERT INTO test_move SELECT i, 'val_' || i FROM generate_series(1, 100) i;
+
+-- move a shard using block_writes; should succeed despite the 1s timeout
+SELECT citus_move_shard_placement(8980000, 'localhost', :worker_1_port, 'localhost', :worker_2_port, shard_transfer_mode:='block_writes');
+SELECT public.wait_for_resource_cleanup();
+
+-- verify data integrity after move
+SELECT count(*) FROM test_move;
+
+-- cleanup: restore idle_in_transaction_session_timeout
+SELECT 1 FROM run_command_on_all_nodes(
+    'ALTER SYSTEM RESET idle_in_transaction_session_timeout');
+SELECT 1 FROM run_command_on_all_nodes('SELECT pg_reload_conf()');
+
+DROP SCHEMA blocking_move_idle_timeout CASCADE;