Fix two bugs in archive_mode=shared on standby

x4m · robot-cloud-aw · commit 65108c7844a3 · 2026-05-05T06:01:15.000Z
1. Checkpoint on standby deletes WAL with .ready status.
   XLogArchiveCheckDone() treated archive_mode=shared like archive_mode=on
   during recovery, returning true unconditionally and allowing checkpoint
   to remove WAL segments that the primary had not yet archived.
   Fix: exclude shared mode from the early-return path, same as "always".

2. Walsender never sends archival status reports after archiving is restored.
   WalSndArchivalReport() calls pgstat_fetch_stat_archiver() whose result is
   cached per-session (PGSTAT_FETCH_CONSISTENCY_CACHE by default).  The
   walsender has no transaction boundaries that would clear the cache, so
   last_archived_wal remained "" forever, and strcmp() suppressed all reports.
   Fix: call pgstat_clear_snapshot() before fetching archiver stats.

Add TAP tests in 051_archive_shared_checkpoint.pl that reproduce both bugs,
and extend 050_archive_shared.pl with checkpoint/restore scenarios.

Reviewed-by: reshke <reshke@double.cloud>
diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c
@@ -573,16 +573,22 @@ XLogArchiveCheckDone(const char *xlog)
 
 	/*
 	 * During archive recovery, the file is deletable if archive_mode is not
-	 * "always".
+	 * "always" or "shared".
+	 *
+	 * In "shared" mode the standby does not archive independently; instead it
+	 * waits for the primary to report successful archival, at which point the
+	 * walreceiver converts the .ready file to .done.  We must therefore fall
+	 * through to the .done/.ready check below so that checkpoint cannot
+	 * delete a segment whose .ready file has not yet become .done.
 	 */
-	if (!XLogArchivingAlways() &&
+	if (!XLogArchivingAlways() && !EffectiveArchiveModeIsShared() &&
 		GetRecoveryState() == RECOVERY_STATE_ARCHIVE)
 		return true;
 
 	/*
 	 * At this point of the logic, note that we are either a primary with
-	 * archive_mode set to "on" or "always", or a standby with archive_mode
-	 * set to "always".
+	 * archive_mode set to "on" or "always", a standby with archive_mode set
+	 * to "always", or a standby with archive_mode set to "shared".
 	 */
 
 	/* First check for .done --- this means archiver is done with it */
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
@@ -2810,10 +2810,13 @@ WalSndArchivalReport(void)
 		return;
 	last_archival_report_timestamp = now;
 	/*
-	 * Get archiver statistics. We use non-blocking access to avoid delaying
-	 * replication if stats collector is slow. If stats are unavailable or
-	 * stale, we'll just try again at the next interval.
+	 * Get archiver statistics.  The pgstat snapshot is cached per-session and
+	 * is only invalidated at transaction boundaries.  The walsender runs
+	 * without transaction boundaries, so we must clear the snapshot explicitly
+	 * to avoid reading stale data (e.g. last_archived_wal stuck at its initial
+	 * empty value even after the archiver has archived new segments).
 	 */
+	pgstat_clear_snapshot();
 	archiver_stats = pgstat_fetch_stat_archiver();
 	if (archiver_stats == NULL)
 		return;
diff --git a/src/test/recovery/t/050_archive_shared.pl b/src/test/recovery/t/050_archive_shared.pl
@@ -267,4 +267,142 @@
 ok($standby2_count >= 3500, "standby2 has all data (got $standby2_count rows)");
 ok($standby3_count >= 3500, "standby3 has all data (got $standby3_count rows)");
 
+###############################################################################
+# Test 5: checkpoint on standby must NOT delete WAL that has .ready status
+#
+# In archive_mode=shared, the standby relies on archival reports from the
+# primary to know when a segment is safe to delete.  Segments not yet
+# confirmed as archived have .ready files.  A checkpoint (CreateRestartPoint)
+# must not remove those WAL files because they may be needed for recovery
+# after a standby promotion if the primary never archived them.
+#
+# Root cause of the bug: XLogArchiveCheckDone() treated archive_mode=shared
+# like archive_mode=on during recovery, bypassing the .ready/.done check.
+###############################################################################
+
+note("Test 5: checkpoint must not delete WAL with .ready on standby");
+
+my $archive_dir5 = PostgreSQL::Test::Utils::tempdir();
+my $primary5    = PostgreSQL::Test::Cluster->new('primary5');
+$primary5->init(has_archiving => 1, allows_streaming => 1);
+$primary5->append_conf(
+	'postgresql.conf', qq{
+archive_mode = shared
+archive_command = 'cp %p "$archive_dir5/%f"'
+});
+$primary5->start;
+$primary5->safe_psql('postgres', 'CREATE TABLE t5 (i int);');
+
+# Ensure WAL activity exists in the current segment before switching.
+# pg_switch_wal() is a no-op when called at the very start of a segment,
+# so we write a row first to guarantee there is WAL to switch away from.
+$primary5->safe_psql('postgres', 'INSERT INTO t5 VALUES (0);');
+$primary5->safe_psql('postgres', 'SELECT pg_switch_wal();');
+
+# Wait for archiver to archive the switched segment
+$primary5->poll_query_until('postgres',
+	'SELECT archived_count > 0 FROM pg_stat_archiver')
+  or die "primary5: archiver did not start";
+
+# Create the standby without wal_keep_size so that checkpoint is free to
+# recycle segments.  backup() returns an empty list (bare "return"), so the
+# backup name is passed to init_from_backup as a literal instead.
+$primary5->backup('backup5');
+my $standby5 = PostgreSQL::Test::Cluster->new('standby5');
+$standby5->init_from_backup($primary5, 'backup5', has_streaming => 1);
+$standby5->append_conf(
+	'postgresql.conf', qq{
+archive_mode = shared
+archive_command = 'cp %p "$archive_dir5/%f"'
+wal_receiver_status_interval = 1s
+});
+$standby5->start;
+$primary5->wait_for_catchup($standby5);
+
+# Break archiving on primary: new segments received by standby will get .ready
+$primary5->adjust_conf('postgresql.conf', 'archive_command', "'/bin/false'");
+$primary5->reload;
+
+# Generate several complete WAL segments.  After the standby replays all of
+# them its redo pointer is well past the first few, making those candidates
+# for checkpoint removal.
+for (1 .. 6)
+{
+	$primary5->safe_psql('postgres',
+		'INSERT INTO t5 SELECT generate_series(1,1000);');
+	$primary5->safe_psql('postgres', 'SELECT pg_switch_wal();');
+}
+$primary5->wait_for_catchup($standby5);
+
+# Collect every WAL segment that has a .ready file on the standby
+my $status_dir5 = $standby5->data_dir . '/pg_wal/archive_status';
+my @ready5;
+if (opendir(my $dh, $status_dir5))
+{
+	@ready5 = map { s/\.ready$//r } grep { /\.ready$/ } readdir($dh);
+	closedir($dh);
+}
+my $n_ready5 = scalar @ready5;
+note("Before checkpoint: $n_ready5 WAL files with .ready");
+cmp_ok($n_ready5, '>', 0, "standby has .ready WAL files before checkpoint");
+
+# Trigger CreateRestartPoint (the standby equivalent of CHECKPOINT).
+# It must not remove WAL files that carry a .ready status.
+$standby5->safe_psql('postgres', 'CHECKPOINT');
+
+my $wal_dir5 = $standby5->data_dir . '/pg_wal';
+my $deleted5 = 0;
+for my $f (@ready5)
+{
+	unless (-f "$wal_dir5/$f")
+	{
+		$deleted5++;
+		diag("BUG: $f had .ready but checkpoint deleted it from standby");
+	}
+}
+is($deleted5, 0,
+	"checkpoint does not delete WAL with .ready (not yet archived by primary)");
+
+###############################################################################
+# Test 6: after archiving is restored on primary, standby .ready -> .done
+#
+# When archive_command is broken for a while and then fixed, the primary will
+# archive the previously-failed segments.  The walsender sends an archival
+# status report to the standby which then converts .ready to .done.
+# This verifies the end-to-end recovery of the mechanism after an outage.
+###############################################################################
+
+note("Test 6: .ready files become .done after archiving restored on primary");
+
+# Capture archived_count before restoring so we can detect new archival
+my $archived_before5 =
+  $primary5->safe_psql('postgres', 'SELECT archived_count FROM pg_stat_archiver');
+
+# Restore archiving
+$primary5->adjust_conf('postgresql.conf', 'archive_command',
+	qq{'cp %p "$archive_dir5/%f"'});
+$primary5->reload;
+
+# Wait for primary to archive the segments that failed during the outage
+$primary5->poll_query_until('postgres',
+	"SELECT archived_count > $archived_before5 FROM pg_stat_archiver")
+  or die "primary5: archiver did not catch up after archive_command restored";
+
+# The walsender sends archival status reports every ~10 s.  Wait up to
+# timeout_default seconds for every .ready file to transition to .done.
+my $remaining5 = $n_ready5;
+for (my $i = 0; $i < $PostgreSQL::Test::Utils::timeout_default; $i++)
+{
+	$remaining5 = 0;
+	if (opendir(my $dh, $status_dir5))
+	{
+		$remaining5 = scalar(grep { /\.ready$/ } readdir($dh));
+		closedir($dh);
+	}
+	last if $remaining5 == 0;
+	sleep(1);
+}
+is($remaining5, 0,
+	"all .ready files become .done after archiving restored on primary");
+
 done_testing();
diff --git a/src/test/recovery/t/051_archive_shared_checkpoint.pl b/src/test/recovery/t/051_archive_shared_checkpoint.pl
@@ -0,0 +1,211 @@
+# Copyright (c) 2025, PostgreSQL Global Development Group
+
+# Tests for archive_mode=shared correctness on standbys:
+#
+# 1. Checkpoint on standby must NOT remove WAL segments that have a .ready
+#    status file (i.e. not yet archived by the primary).  With the bug,
+#    XLogArchiveCheckDone() returns true unconditionally during recovery for
+#    any mode that is not "always", so checkpoint deletes these segments.
+#
+# 2. After archiving is broken on the primary and then restored, .ready files
+#    on the standby must eventually transition to .done (primary sends archival
+#    status reports to the standby via the walsender).
+
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Use 1 MB WAL segments so we can generate many segments cheaply.
+my $wal_segsize = 1;
+
+# An archive command that always fails (but is recognized by the archiver as a
+# real failure, not a missing command).  Mirrors the approach in
+# 020_archive_status.pl to stay portable.
+my $broken_command =
+  $PostgreSQL::Test::Utils::windows_os
+  ? q{copy "%p_does_not_exist" "%f_does_not_exist"}
+  : q{cp "%p_does_not_exist" "%f_does_not_exist"};
+
+my $archive_dir = PostgreSQL::Test::Utils::tempdir();
+my $good_command =
+  $PostgreSQL::Test::Utils::windows_os
+  ? qq{copy "%p" "$archive_dir\\%f"}
+  : qq{cp %p "$archive_dir/%f"};
+
+###############################################################################
+# Set up primary with archive_mode=shared and BROKEN archiving so that every
+# WAL segment received by the standby gets a .ready file.
+###############################################################################
+
+my $primary = PostgreSQL::Test::Cluster->new('primary');
+$primary->init(
+	has_archiving   => 1,
+	allows_streaming => 1,
+	extra           => [ '--wal-segsize' => $wal_segsize ]);
+$primary->append_conf('postgresql.conf', qq{
+archive_mode    = shared
+archive_command = '$broken_command'
+wal_keep_size   = 0
+});
+$primary->start;
+
+my $backup_name = 'standby_backup';
+$primary->backup($backup_name);
+
+my $standby = PostgreSQL::Test::Cluster->new('standby');
+$standby->init_from_backup($primary, $backup_name, has_streaming => 1);
+$standby->append_conf('postgresql.conf', qq{
+archive_mode               = shared
+archive_command            = '$good_command'
+wal_receiver_status_interval = 1s
+wal_keep_size              = 0
+});
+$standby->start;
+
+$primary->wait_for_catchup($standby);
+
+###############################################################################
+# Generate WAL while archiving is broken.
+# The walreceiver will create .ready files for every received segment.
+###############################################################################
+
+$primary->safe_psql('postgres', 'CREATE TABLE t (x int)');
+
+# Switch WAL several times to create clearly-identifiable old segments.
+# We capture the name of the first switched-away segment; it is the segment
+# most likely to be deleted by a checkpoint on the standby.
+my $target_seg = $primary->safe_psql('postgres',
+	q{SELECT pg_walfile_name(pg_current_wal_lsn())});
+
+for my $i (1..5)
+{
+	$primary->safe_psql('postgres',
+		"INSERT INTO t SELECT generate_series(1,500)");
+	$primary->safe_psql('postgres', 'SELECT pg_switch_wal()');
+}
+
+# Wait for the archiver to register failures so we are sure archiving is
+# truly broken (not just slow).
+$primary->poll_query_until('postgres',
+	q{SELECT failed_count > 0 FROM pg_stat_archiver})
+  or die "Timed out waiting for archiver to fail";
+
+# Issue a CHECKPOINT on the primary so that the standby can form a
+# restartpoint whose redo LSN is past $target_seg.
+$primary->safe_psql('postgres', 'CHECKPOINT');
+
+# Wait for the standby to replay everything up to that checkpoint.
+$primary->wait_for_catchup($standby);
+
+my $standby_wal_dir    = $standby->data_dir . '/pg_wal';
+my $standby_status_dir = "$standby_wal_dir/archive_status";
+
+# The target segment must already be visible on the standby as .ready.
+my $target_ready = "$standby_status_dir/$target_seg.ready";
+ok(-f $target_ready,
+	"standby has .ready file for segment $target_seg (not archived by primary)");
+
+# The WAL file itself must also be present.
+ok(-f "$standby_wal_dir/$target_seg",
+	"WAL segment $target_seg exists in standby pg_wal before CHECKPOINT");
+
+###############################################################################
+# Test 1: CHECKPOINT (restartpoint) on standby must not remove .ready segments
+###############################################################################
+
+# This triggers CreateRestartPoint, which calls RemoveOldXlogFiles.
+# With the bug, XLogArchiveCheckDone returns true for every segment in
+# archive_mode=shared during recovery, so $target_seg would be deleted.
+$standby->safe_psql('postgres', 'CHECKPOINT');
+
+ok(-f "$standby_wal_dir/$target_seg",
+	"WAL segment $target_seg still exists after CHECKPOINT on standby "
+	  . "(not deleted despite .ready status)");
+
+ok(-f $target_ready,
+	".ready file for $target_seg still present after CHECKPOINT on standby");
+
+###############################################################################
+# Test 2: Restoring archiving on primary causes .ready -> .done on standby
+#
+# This part is independent of Test 1: we generate fresh WAL (with archiving
+# still broken) so the standby accumulates new .ready files, then restore
+# archiving and verify those files become .done.
+###############################################################################
+
+# Generate a few more segments so the standby definitely has fresh .ready files
+# regardless of what checkpoint may have done above.
+for my $i (1..3)
+{
+	$primary->safe_psql('postgres',
+		"INSERT INTO t SELECT generate_series(1,200)");
+	$primary->safe_psql('postgres', 'SELECT pg_switch_wal()');
+}
+$primary->wait_for_catchup($standby);
+
+# Collect all current .ready files on the standby.
+my @ready_segs;
+if (opendir(my $dh, $standby_status_dir))
+{
+	@ready_segs =
+	  map { (my $s = $_) =~ s/\.ready$//; $s }
+	  grep { /\.ready$/ } readdir($dh);
+	closedir($dh);
+}
+note("Standby has "
+	  . scalar(@ready_segs)
+	  . " .ready segments before archiving is restored");
+cmp_ok(scalar(@ready_segs), '>', 0,
+	"standby has fresh .ready files for newly received unarchived segments");
+
+# Restore archiving on the primary.
+$primary->safe_psql('postgres', qq{
+	ALTER SYSTEM SET archive_command TO '$good_command';
+	SELECT pg_reload_conf();
+});
+
+# Wait until primary has archived at least one segment.
+$primary->poll_query_until('postgres',
+	q{SELECT archived_count > 0 FROM pg_stat_archiver})
+  or die "Timed out waiting for primary to start archiving after restore";
+
+# Generate one more WAL switch so the walsender picks up the updated
+# last_archived_wal and sends a fresh archival report to the standby.
+# (The walsender sends a report only when last_archived_wal has changed,
+# and at most once every ARCHIVAL_REPORT_INTERVAL = 10 s.)
+$primary->safe_psql('postgres', 'SELECT pg_switch_wal()');
+$primary->wait_for_catchup($standby);
+
+# Poll until all previously-.ready segments have become .done.
+# Allow up to the framework default timeout (usually 120 s); the walsender
+# reports every 10 s so convergence should happen well within that.
+my $remaining_ready = scalar(@ready_segs);
+for my $i (1 .. $PostgreSQL::Test::Utils::timeout_default)
+{
+	$remaining_ready = 0;
+	if (opendir(my $dh, $standby_status_dir))
+	{
+		# Count only the segments that were .ready before archiving was restored
+		for my $seg (@ready_segs)
+		{
+			$remaining_ready++ if -f "$standby_status_dir/$seg.ready";
+		}
+		closedir($dh);
+	}
+	last if $remaining_ready == 0;
+	sleep(1);
+}
+
+is($remaining_ready, 0,
+	"all .ready files on standby transitioned to .done "
+	  . "after archiving restored on primary");
+
+# Sanity check: the WAL files are still present (they were neither deleted
+# by checkpoint while .ready nor lost in any other way).
+my @still_missing = grep { !-f "$standby_wal_dir/$_" } @ready_segs;
+is(scalar(@still_missing), 0,
+	"WAL segments were not lost while waiting for archival reports");
+
+done_testing();