# Copyright (c) 2025, PostgreSQL Global Development Group

# Tests for archive_mode=shared correctness on standbys:
#
# 1. Checkpoint on standby must NOT remove WAL segments that have a .ready
#    status file (i.e. not yet archived by the primary).  With the bug,
#    XLogArchiveCheckDone() returns true unconditionally during recovery for
#    any mode that is not "always", so checkpoint deletes these segments.
#
# 2. After archiving is broken on the primary and then restored, .ready files
#    on the standby must eventually transition to .done (primary sends archival
#    status reports to the standby via the walsender).

use strict;
use warnings FATAL => 'all';
use PostgreSQL::Test::Cluster;
use PostgreSQL::Test::Utils;
use Test::More;

###############################################################################
# Helpers
###############################################################################

# Return the names of all WAL segments that currently have a .ready status
# file in the given archive_status directory.
#
# Dies if the directory cannot be opened, so that an unreadable directory is
# never mistaken for "there are no .ready files".
sub ready_segments
{
	my ($status_dir) = @_;

	opendir(my $dh, $status_dir)
	  or die "could not open directory \"$status_dir\": $!";
	my @segments =
	  map { (my $seg = $_) =~ s/\.ready$//; $seg }
	  grep { /\.ready$/ } readdir($dh);
	closedir($dh);

	return @segments;
}

# Return how many of the given segments still have a .ready status file in
# the given archive_status directory.
sub count_still_ready
{
	my ($status_dir, @segments) = @_;

	return scalar(grep { -f "$status_dir/$_.ready" } @segments);
}

# Return a copy of the given value with backslashes doubled, for embedding in
# a single-quoted postgresql.conf string.  The configuration file parser
# interprets backslash escapes, so Windows paths would otherwise be mangled;
# PostgreSQL::Test::Cluster::enable_archiving takes the same precaution.
# On non-Windows platforms the commands used here contain no backslashes, so
# this is a no-op there.
sub conf_escape
{
	my ($value) = @_;

	$value =~ s{\\}{\\\\}g;
	return $value;
}

###############################################################################
# Common settings
###############################################################################

# Use 1 MB WAL segments so we can generate many segments cheaply.
my $wal_segsize = 1;

# An archive command that always fails (but is recognized by the archiver as a
# real failure, not a missing command).  Mirrors the approach in
# 020_archive_status.pl to stay portable.
my $broken_command =
  $PostgreSQL::Test::Utils::windows_os
  ? q{copy "%p_does_not_exist" "%f_does_not_exist"}
  : q{cp "%p_does_not_exist" "%f_does_not_exist"};

# An archive command that copies each segment into a scratch directory.
my $archive_dir = PostgreSQL::Test::Utils::tempdir();
my $good_command =
  $PostgreSQL::Test::Utils::windows_os
  ? qq{copy "%p" "$archive_dir\\%f"}
  : qq{cp %p "$archive_dir/%f"};

# Variant of the working command that is safe to write into postgresql.conf.
# The unescaped form is still what must be used in ALTER SYSTEM below.
my $good_command_conf = conf_escape($good_command);

###############################################################################
# Set up primary with archive_mode=shared and BROKEN archiving so that every
# WAL segment received by the standby gets a .ready file.
###############################################################################

my $primary = PostgreSQL::Test::Cluster->new('primary');
$primary->init(
	has_archiving => 1,
	allows_streaming => 1,
	extra => [ '--wal-segsize' => $wal_segsize ]);
$primary->append_conf('postgresql.conf', qq{
archive_mode = shared
archive_command = '$broken_command'
wal_keep_size = 0
});
$primary->start;

my $backup_name = 'standby_backup';
$primary->backup($backup_name);

my $standby = PostgreSQL::Test::Cluster->new('standby');
$standby->init_from_backup($primary, $backup_name, has_streaming => 1);
$standby->append_conf('postgresql.conf', qq{
archive_mode = shared
archive_command = '$good_command_conf'
wal_receiver_status_interval = 1s
wal_keep_size = 0
});
$standby->start;

$primary->wait_for_catchup($standby);

###############################################################################
# Generate WAL while archiving is broken.
# The walreceiver will create .ready files for every received segment.
###############################################################################

$primary->safe_psql('postgres', 'CREATE TABLE t (x int)');

# Switch WAL several times to create clearly-identifiable old segments.
# We capture the name of the first switched-away segment; it is the primary
# candidate that checkpoint would delete.
my $target_seg = $primary->safe_psql('postgres',
	q{SELECT pg_walfile_name(pg_current_wal_lsn())});

for my $i (1 .. 5)
{
	$primary->safe_psql('postgres',
		"INSERT INTO t SELECT generate_series(1,500)");
	$primary->safe_psql('postgres', 'SELECT pg_switch_wal()');
}

# Wait for the archiver to register failures so we are sure archiving is
# truly broken (not just slow).
$primary->poll_query_until('postgres',
	q{SELECT failed_count > 0 FROM pg_stat_archiver})
  or die "Timed out waiting for archiver to fail";

# Issue a CHECKPOINT on the primary so that the standby can form a
# restartpoint whose redo LSN is past $target_seg.
$primary->safe_psql('postgres', 'CHECKPOINT');

# Wait for the standby to replay everything up to that checkpoint.
$primary->wait_for_catchup($standby);

my $standby_wal_dir = $standby->data_dir . '/pg_wal';
my $standby_status_dir = "$standby_wal_dir/archive_status";

# The target segment must already be visible on the standby as .ready.
my $target_ready = "$standby_status_dir/$target_seg.ready";
ok(-f $target_ready,
	"standby has .ready file for segment $target_seg (not archived by primary)");

# The WAL file itself must also be present.
ok(-f "$standby_wal_dir/$target_seg",
	"WAL segment $target_seg exists in standby pg_wal before CHECKPOINT");

###############################################################################
# Test 1: CHECKPOINT (restartpoint) on standby must not remove .ready segments
###############################################################################

# This triggers CreateRestartPoint, which calls RemoveOldXlogFiles.
# With the bug, XLogArchiveCheckDone returns true for every segment in
# archive_mode=shared during recovery, so $target_seg would be deleted.
$standby->safe_psql('postgres', 'CHECKPOINT');

ok(-f "$standby_wal_dir/$target_seg",
	"WAL segment $target_seg still exists after CHECKPOINT on standby "
	  . "(not deleted despite .ready status)");

ok(-f $target_ready,
	".ready file for $target_seg still present after CHECKPOINT on standby");

###############################################################################
# Test 2: Restoring archiving on primary causes .ready -> .done on standby
#
# This part is independent of Test 1: we generate fresh WAL (with archiving
# still broken) so the standby accumulates new .ready files, then restore
# archiving and verify those files become .done.
###############################################################################

# Generate a few more segments so the standby definitely has fresh .ready files
# regardless of what checkpoint may have done above.
for my $i (1 .. 3)
{
	$primary->safe_psql('postgres',
		"INSERT INTO t SELECT generate_series(1,200)");
	$primary->safe_psql('postgres', 'SELECT pg_switch_wal()');
}
$primary->wait_for_catchup($standby);

# Collect all current .ready files on the standby.
my @ready_segs = ready_segments($standby_status_dir);
note("Standby has "
	  . scalar(@ready_segs)
	  . " .ready segments before archiving is restored");
cmp_ok(scalar(@ready_segs), '>', 0,
	"standby has fresh .ready files for newly received unarchived segments");

# Restore archiving on the primary.  ALTER SYSTEM escapes the value itself
# when writing postgresql.auto.conf, so the unescaped command is used here.
$primary->safe_psql('postgres', qq{
	ALTER SYSTEM SET archive_command TO '$good_command';
	SELECT pg_reload_conf();
});

# Wait until primary has archived at least one segment.
$primary->poll_query_until('postgres',
	q{SELECT archived_count > 0 FROM pg_stat_archiver})
  or die "Timed out waiting for primary to start archiving after restore";

# Generate one more WAL switch so the walsender picks up the updated
# last_archived_wal and sends a fresh archival report to the standby.
# (The walsender only sends when last_archived_wal changes and every
# ARCHIVAL_REPORT_INTERVAL = 10 s at most.)
$primary->safe_psql('postgres', 'SELECT pg_switch_wal()');
$primary->wait_for_catchup($standby);

# Poll until none of the previously-.ready segments has a .ready file left.
# Allow up to the framework default timeout (180 s unless overridden through
# PG_TEST_TIMEOUT_DEFAULT); the walsender reports every 10 s so convergence
# should happen well within that.  Only the segments that were .ready before
# archiving was restored are counted.
my $remaining_ready = scalar(@ready_segs);
for my $attempt (1 .. $PostgreSQL::Test::Utils::timeout_default)
{
	$remaining_ready = count_still_ready($standby_status_dir, @ready_segs);
	last if $remaining_ready == 0;
	sleep(1);
}

is($remaining_ready, 0,
	"all .ready files on standby transitioned to .done "
	  . "after archiving restored on primary");

# Sanity-check: the WAL files are still present (they weren't deleted by
# checkpoint while .ready, nor disappeared otherwise).
my @still_missing = grep { !-f "$standby_wal_dir/$_" } @ready_segs;
is(scalar(@still_missing), 0,
	"WAL segments were not lost while waiting for archival reports");

done_testing();