Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/yb/client/client-internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -990,6 +990,7 @@ Status YBClient::Data::BackfillIndex(YBClient* client,
const YBTableName& index_name,
const TableId& index_id,
CoarseTimePoint deadline,
std::optional<TransactionMetadata> requester_transaction,
bool wait) {
BackfillIndexRequestPB req;
BackfillIndexResponsePB resp;
Expand All @@ -1000,6 +1001,9 @@ Status YBClient::Data::BackfillIndex(YBClient* client,
if (!index_id.empty()) {
req.mutable_index_identifier()->set_table_id(index_id);
}
if (requester_transaction.has_value()) {
requester_transaction->ToPB(req.mutable_requester_transaction());
}

RETURN_NOT_OK((SyncLeaderMasterRpc(
deadline, req, &resp, "BackfillIndex", &master::MasterDdlProxy::BackfillIndexAsync)));
Expand Down
1 change: 1 addition & 0 deletions src/yb/client/client-internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ class YBClient::Data {
const YBTableName& table_name,
const TableId& table_id,
CoarseTimePoint deadline,
std::optional<TransactionMetadata> requester_transaction,
bool wait = true);
Status IsBackfillIndexInProgress(YBClient* client,
const TableId& table_id,
Expand Down
7 changes: 5 additions & 2 deletions src/yb/client/client.cc
Original file line number Diff line number Diff line change
Expand Up @@ -622,11 +622,14 @@ Status YBClient::TruncateTables(const TableIds& table_ids, bool wait) {
return data_->TruncateTables(this, table_ids, deadline, wait);
}

Status YBClient::BackfillIndex(const TableId& table_id, bool wait, CoarseTimePoint deadline) {
Status YBClient::BackfillIndex(const TableId& table_id,
std::optional<TransactionMetadata> requester_transaction,
bool wait, CoarseTimePoint deadline) {
if (deadline == CoarseTimePoint()) {
deadline = CoarseMonoClock::Now() + FLAGS_backfill_index_client_rpc_timeout_ms * 1ms;
}
return data_->BackfillIndex(this, YBTableName(), table_id, deadline, wait);
return data_->BackfillIndex(
this, YBTableName(), table_id, deadline, std::move(requester_transaction), wait);
}

Status YBClient::GetIndexBackfillProgress(
Expand Down
5 changes: 4 additions & 1 deletion src/yb/client/client.h
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,10 @@ class YBClient {

// Backfill the specified index table. This is only supported for YSQL at the moment.
Status BackfillIndex(
const TableId& table_id, bool wait = true, CoarseTimePoint deadline = CoarseTimePoint());
const TableId& table_id,
std::optional<TransactionMetadata> requester_transaction,
bool wait = true,
CoarseTimePoint deadline = CoarseTimePoint());

Status GetIndexBackfillProgress(
const TableIds& index_ids,
Expand Down
4 changes: 3 additions & 1 deletion src/yb/client/client_master_rpc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,9 @@ void ClientMasterRpcBase::Finished(const Status& status) {
}

if (new_status.IsNetworkError() || new_status.IsRemoteError()) {
if (rpc::RpcError(new_status) != rpc::ErrorStatusPB::ERROR_NO_SUCH_METHOD) {
const auto rpc_error = rpc::RpcError(new_status);
if (rpc_error != rpc::ErrorStatusPB::ERROR_NO_SUCH_METHOD &&
rpc_error != rpc::ErrorStatusPB::FATAL_SERVER_SHUTTING_DOWN) {
Comment thread
egladysh marked this conversation as resolved.
LOG(WARNING) << ToString() << ": Encountered a network error from the Master("
<< client_data_->leader_master_hostport().ToString()
<< "): " << new_status.ToString() << ", retrying...";
Expand Down
82 changes: 76 additions & 6 deletions src/yb/master/backfill_index.cc
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,10 @@ DEFINE_test_flag(bool, skip_index_backfill, false,
DEFINE_test_flag(bool, block_do_backfill, false,
"Block DoBackfill from proceeding.");

DEFINE_test_flag(bool, skip_ddl_requester_liveness_check, false,
"Skip starting the requester liveness task. Used in tests to simulate the pre-fix behavior "
"where master continues sending BackfillIndex RPCs after the backend is killed.");

DEFINE_test_flag(bool, simulate_empty_indexes_during_backfill, false,
"Simulates BackfillTable::indexes_to_build() to return an empty set.");

Expand Down Expand Up @@ -324,7 +328,8 @@ Status MultiStageAlterTable::StartBackfillingData(
CatalogManager* catalog_manager,
const scoped_refptr<TableInfo>& indexed_table,
const std::vector<IndexInfoPB>& idx_infos,
std::optional<uint32_t> current_version, const LeaderEpoch& epoch) {
std::optional<uint32_t> current_version, const LeaderEpoch& epoch,
std::optional<TransactionMetadata> requester_transaction) {
// We leave the table state as ALTERING so that a master failover can resume the backfill.
RETURN_NOT_OK(ClearFullyAppliedAndUpdateState(
catalog_manager, indexed_table, current_version, /* change_state to RUNNING */ false, epoch));
Expand All @@ -337,6 +342,14 @@ Status MultiStageAlterTable::StartBackfillingData(
VLOG(0) << __func__ << " starting backfill on " << indexed_table->ToString() << " for "
<< yb::ToString(idx_infos);

// Retrieve the requester transaction if it was stored during the permission-update phase.
// Pass current_version so TakePendingBackfillRequesterTransaction rejects stale
// transactions from earlier backfill attempts.
if (!requester_transaction && current_version) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (!requester_transaction && current_version) {
if (current_version) {

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think the code is wrong. It guards against the case when the master has no in memory transaction. It's related to my previous comment about nullopt if it holds.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I should have clarified that this suggestion was contingent on #31378 (comment) being true. I believe it is true, but it is not a blocker to me to have this dead code.

requester_transaction =
indexed_table->TakePendingBackfillRequesterTransaction(*current_version);
}

if (FLAGS_TEST_skip_index_backfill) {
TRACE("Skipping backfill of data on tservers");
LOG(INFO) << "Skipping backfill of data on tservers";
Expand All @@ -345,7 +358,7 @@ Status MultiStageAlterTable::StartBackfillingData(

auto backfill_table = std::make_shared<BackfillTable>(
catalog_manager->master_, catalog_manager->AsyncTaskPool(), indexed_table, idx_infos,
*ns_info, epoch);
*ns_info, epoch, std::move(requester_transaction));
Status s = backfill_table->Launch();
if (!s.ok()) {
indexed_table->ClearIsBackfilling();
Expand Down Expand Up @@ -387,7 +400,8 @@ IndexPermissions NextPermission(IndexPermissions perm) {

Status MultiStageAlterTable::LaunchNextTableInfoVersionIfNecessary(
CatalogManager* catalog_manager, const scoped_refptr<TableInfo>& indexed_table,
uint32_t current_version, const LeaderEpoch& epoch, bool respect_backfill_deferrals,
uint32_t current_version, const LeaderEpoch& epoch,
std::optional<TransactionMetadata> requester_transaction, bool respect_backfill_deferrals,
bool update_ysql_to_backfill) {
DVLOG_WITH_FUNC(3)
<< Format("$0, version: $1, respect_deferrals: $2, update_ysql_to_backfill: $3",
Expand Down Expand Up @@ -502,6 +516,15 @@ Status MultiStageAlterTable::LaunchNextTableInfoVersionIfNecessary(

if (permissions_updated.ok() && *permissions_updated) {
VLOG(1) << "Sending alter table request with updated permissions";
// Store the requester transaction so StartBackfillingData can retrieve it when the
// permission change reaches DO_BACKFILL and the second call launches backfill.
// Store current_version+1 (the new version after this permission update)
// so TakePendingBackfillRequesterTransaction can verify the transaction
// belongs to this exact backfill attempt and not a stale one.
if (requester_transaction) {
indexed_table->SetPendingBackfillRequesterTransaction(
std::move(requester_transaction), current_version + 1);
}
RETURN_NOT_OK(catalog_manager->SendAlterTableRequest(indexed_table, epoch));
return Status::OK();
}
Expand Down Expand Up @@ -530,7 +553,8 @@ Status MultiStageAlterTable::LaunchNextTableInfoVersionIfNecessary(
}
WARN_NOT_OK(
StartBackfillingData(
catalog_manager, indexed_table.get(), indexes_to_backfill, current_version, epoch),
catalog_manager, indexed_table.get(), indexes_to_backfill, current_version, epoch,
std::move(requester_transaction)),
Comment thread
egladysh marked this conversation as resolved.
yb::Format("Could not launch backfill for $0", indexed_table->ToString()));
}

Expand Down Expand Up @@ -627,7 +651,7 @@ std::string RetrieveIndexNames(CatalogManager* mgr,
BackfillTable::BackfillTable(
Master* master, ThreadPool* callback_pool, const scoped_refptr<TableInfo>& indexed_table,
std::vector<IndexInfoPB> indexes, const scoped_refptr<NamespaceInfo>& ns_info,
LeaderEpoch epoch)
LeaderEpoch epoch, std::optional<TransactionMetadata> requester_transaction)
Comment thread
egladysh marked this conversation as resolved.
: master_(master),
callback_pool_(callback_pool),
indexed_table_(indexed_table),
Expand All @@ -637,7 +661,8 @@ BackfillTable::BackfillTable(
RetrieveIndexNames(master->catalog_manager_impl(), requested_index_ids_)),
ns_info_(ns_info),
epoch_(std::move(epoch)),
wait_state_(ash::WaitStateInfo::CreateIfAshIsEnabled<ash::WaitStateInfo>()) {
wait_state_(ash::WaitStateInfo::CreateIfAshIsEnabled<ash::WaitStateInfo>()),
requester_transaction_(std::move(requester_transaction)) {
if (wait_state_) {
if (const auto& current_state = ash::WaitStateInfo::CurrentWaitState()) {
wait_state_->UpdateMetadata(current_state->metadata());
Expand Down Expand Up @@ -951,6 +976,7 @@ Status BackfillTable::DoLaunchBackfill() {
}

Status BackfillTable::DoBackfill() {
StartRequesterLivenessMonitor();
while (FLAGS_TEST_block_do_backfill) {
constexpr auto kSpinWait = 100ms;
LOG(INFO) << Format("Blocking $0 for $1", __func__, kSpinWait);
Expand Down Expand Up @@ -984,6 +1010,7 @@ Status BackfillTable::Done(const Status& s, const std::unordered_set<TableId>& f
if (!done() && --tablets_pending_ == 0) {
LOG_WITH_PREFIX(INFO) << "Completed backfilling the index table.";
done_.store(true, std::memory_order_release);
StopLivenessMonitor();
RETURN_NOT_OK_PREPEND(
MarkAllIndexesAsSuccess(), "Failed to mark indexes as successfully backfilled.");
RETURN_NOT_OK_PREPEND(UpdateIndexPermissionsForIndexes(), "Failed to complete backfill.");
Expand All @@ -997,6 +1024,7 @@ Status BackfillTable::MarkIndexesAsFailed(
const std::unordered_set<TableId>& failed_indexes, const string& message) {
if (indexes_to_build() == failed_indexes) {
done_.store(true, std::memory_order_release);
StopLivenessMonitor();
Comment thread
egladysh marked this conversation as resolved.
backfill_job_->SetState(MonitoredTaskState::kFailed);
}
return MarkIndexesAsDesired(failed_indexes, BackfillJobPB::FAILED, message);
Expand Down Expand Up @@ -1077,6 +1105,47 @@ Status BackfillTable::MarkIndexesAsDesired(
return Status::OK();
}

void BackfillTable::StartRequesterLivenessMonitor() {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After some back-and-forth with AI, got this (which I haven't fully verified in the interest of time):

Review of StartRequesterLivenessMonitor and StopLivenessMonitor

StartRequesterLivenessMonitor

Issue 1: Race between CreateAndStartTask and storing liveness_task_

The task is created and started at line 1120, but not stored into liveness_task_ until line 1131. The task begins polling immediately. If the transaction is already aborted (or aborts very quickly), the task's FinishPollTransaction fires abort_()BackfillTable::Abort()MarkAllIndexesAsFailed()CheckIfDone()StopLivenessMonitor() before liveness_task_ is assigned. StopLivenessMonitor sees null and does nothing.

In this specific case the task happens to have self-completed via Complete() before calling abort_(), so there's no leak. But the correctness depends on an implementation detail of FinishPollTransaction's ordering (Complete() before abort_()) — if that ordering ever changes, this breaks silently.

The fix is straightforward: create the task without starting it, store it under the lock, then start it:

auto task = std::make_shared<DdlRequesterLivenessTask>(...);
{
  std::lock_guard l(mutex_);
  DCHECK(!liveness_task_);
  liveness_task_ = task;
}
task->Start();

This would require exposing a two-phase create+start API on DdlRequesterLivenessTask (the current CreateAndStartTask bundles both).

Issue 2: No error handling on CreateAndStartTask

CreateAndStartTask returns a shared_ptr, not a Result. If Start() fails internally (threadpool full, task immediately aborted by ValidateRunnable, etc.), the caller has no way to know the liveness monitor is non-functional. The entire feature becomes silently disabled with no log message indicating why.

At minimum, CreateAndStartTask should return Result<shared_ptr<DdlRequesterLivenessTask>> or the caller should verify the task's state after creation. Alternatively, log a warning if the task is in a terminal state immediately after start.

Issue 3: Callback captures shared_ptr<BackfillTable>

The lambdas capture self via shared_from_this(). This prevents BackfillTable destruction while the liveness task is alive. If StopLivenessMonitor is never called (e.g. a code path that sets done_ without going through the normal terminal paths), the BackfillTable leaks along with the task. Today's call sites appear to cover all terminal paths, but this is fragile — a future change that adds a new exit path could miss the StopLivenessMonitor call.

Consider weak_ptr for the callbacks, or a mechanism where the task self-terminates when done_() returns true (which ValidateRunnable already does — but only on the next scheduled step, not immediately).

StopLivenessMonitor

Good: Lock discipline avoids deadlock

Moving the task out via std::move under mutex_ then calling AbortAndReturnPrevState outside the lock is correct. The abort path calls PerformAbort()Shutdown()sync_.Wait(). If this were done under mutex_, and the task's in-flight callback tried to re-enter BackfillTable (which acquires mutex_), you'd deadlock.

Good: Idempotency via std::move

After std::move, liveness_task_ is null; subsequent calls are no-ops. This is essential because StopLivenessMonitor is called from multiple convergent terminal paths: Done() (success), MarkIndexesAsFailed() (failure), CheckIfDone() (via Abort()).

Concern: AbortAndReturnPrevState may block

AbortAndReturnPrevState can trigger PerformAbort()Shutdown()sync_.Wait(), which blocks until the in-flight GetTransactionStatus RPC completes. This is called from Done() and CheckIfDone(), which run on the callback threadpool. A slow or hung transaction status RPC could stall the backfill completion path. The transaction_rpc_timeout_ms flag bounds this, but it's worth being aware of.

Summary of actionable items

  1. Fix the create-then-store race — separate creation from starting, or at minimum document why the current ordering is safe and what invariants it depends on.
  2. Add error handling for task creation — at minimum log when the liveness monitor fails to start.
  3. Consider the blocking potential of StopLivenessMonitorsync_.Wait() can stall the callback threadpool.

if (!requester_transaction_) {
return;
}
if (PREDICT_FALSE(FLAGS_TEST_skip_ddl_requester_liveness_check)) {
LOG_WITH_PREFIX(INFO) << "Skipping requester liveness monitor (TEST flag set)";
return;
}
VLOG_WITH_PREFIX(1) << "Starting requester liveness monitor for transaction "
<< requester_transaction_->transaction_id;

auto self = shared_from_this();
BackgroundDdlCallbacks callbacks{
.done_ = [self] { return self->done(); },
.abort_ = [self] { return self->Abort(); },
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't seem like BackfillTable::Abort / BackfillTable::Done are ready to be called in a multi threaded context once we add this callback.

  1. txn poll can call Abort while backfilltablet is causing a transition to Done success path.
  2. txn poll can call Abort while backfilltablet is causing its own failure transition.

It seems like so far they were able to use std atomics to avoid real locking but now it would be better to use a proper lock to keep it simple. we can have some explicit internal enum state like waiting, aborting, aborted, success and use that to decide what to do from the callbacks (we only want to affect waiting state from the txn callback and not the others). Any other approaches are also ok but current path seems prone to problems.

Copy link
Copy Markdown
Collaborator Author

@egladysh egladysh May 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@iSignal Ah... I assumed that those atomics and mutex's were there to make them thread-safe. I do see the gap now. We can fix it with an enum (kind of a state machine you suggested) or just moving the done_ usage around like:

Status BackfillTable::Abort() {
      bool expected = false;
      if (!done_.compare_exchange_strong(expected, true)) {
          return Status::OK();
      }
      ...

Which one would you prefer?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done_ would be simpler but does not handle race between Abort and tablet Done failure path right? Both may try to mark indexes as failed. I guess an atomic int enum CAS with more than true/false can help distinguish the different states.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, I amn't sure. It seems like indexes_to_build() takes care of it with LockForWrite. Transition failed to failed is harmless and after failed indexes_to_build() will return {} if my understanding is correct?

};
auto task = DdlRequesterLivenessTask::CreateAndStartTask(
*master_->catalog_manager_impl(),
indexed_table_,
*requester_transaction_,
std::move(callbacks),
master_->client_future(),
*master_->messenger(),
epoch_);

std::lock_guard l(mutex_);
DCHECK(!liveness_task_) << "Liveness task already exists";
liveness_task_ = std::move(task);
}

void BackfillTable::StopLivenessMonitor() {
std::shared_ptr<DdlRequesterLivenessTask> task;
{
std::lock_guard l(mutex_);
task = std::move(liveness_task_);
}
if (task) {
task->AbortAndReturnPrevState(STATUS(Aborted, "BackfillTable is done"));
}
}

Status BackfillTable::Abort() {
LOG(WARNING) << "Backfill failed/aborted.";
RETURN_NOT_OK(MarkAllIndexesAsFailed());
Expand All @@ -1086,6 +1155,7 @@ Status BackfillTable::Abort() {
Status BackfillTable::CheckIfDone() {
if (indexes_to_build().empty()) {
done_.store(true, std::memory_order_release);
StopLivenessMonitor();
RETURN_NOT_OK_PREPEND(
UpdateIndexPermissionsForIndexes(),
"Could not update index permissions after backfill");
Expand Down
22 changes: 18 additions & 4 deletions src/yb/master/backfill_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,15 @@

#include "yb/ash/wait_state.h"
#include "yb/common/entity_ids.h"
#include "yb/common/transaction.h"
#include "yb/dockv/partition.h"

#include "yb/gutil/integral_types.h"
#include "yb/gutil/ref_counted.h"

#include "yb/master/async_rpc_tasks_base.h"
#include "yb/master/catalog_entity_info.h"
#include "yb/master/ysql_ddl_verification_task.h"

#include "yb/qlexpr/index.h"

Expand Down Expand Up @@ -67,7 +69,9 @@ class MultiStageAlterTable {
// INDEX_PERM_DELETE_ONLY -> INDEX_PERM_WRITE_AND_DELETE -> BACKFILL
static Status LaunchNextTableInfoVersionIfNecessary(
CatalogManager* mgr, const scoped_refptr<TableInfo>& Info, uint32_t current_version,
const LeaderEpoch& epoch, bool respect_backfill_deferrals = true,
const LeaderEpoch& epoch,
std::optional<TransactionMetadata> requester_transaction,
bool respect_backfill_deferrals = true,
bool update_ysql_to_backfill = false);

// Clears the fully_applied_* state for the given table and optionally sets it to RUNNING.
Expand All @@ -94,10 +98,13 @@ class MultiStageAlterTable {

private:
// Start Index Backfill process/step for the specified table/index.
// If requester_transaction is provided it will be used to monitor the liveness of the
// PG backend that initiated the backfill.
static Status StartBackfillingData(
CatalogManager* catalog_manager, const scoped_refptr<TableInfo>& indexed_table,
const std::vector<IndexInfoPB>& idx_infos, std::optional<uint32_t> expected_version,
const LeaderEpoch& epoch);
const LeaderEpoch& epoch,
std::optional<TransactionMetadata> requester_transaction);
};

class BackfillTablet;
Expand All @@ -112,7 +119,8 @@ class BackfillTable : public std::enable_shared_from_this<BackfillTable> {
const scoped_refptr<TableInfo> &indexed_table,
std::vector<IndexInfoPB> indexes,
const scoped_refptr<NamespaceInfo> &ns_info,
LeaderEpoch epoch);
LeaderEpoch epoch,
std::optional<TransactionMetadata> requester_transaction);

Status Launch();

Expand Down Expand Up @@ -169,6 +177,8 @@ class BackfillTable : public std::enable_shared_from_this<BackfillTable> {

static void UnsetIndexTableRetainsDeleteMarkers(PersistentTableInfo* index_table);

Status Abort();

private:
void LaunchBackfillOrAbort();
Status WaitForTabletSplitting();
Expand All @@ -188,7 +198,8 @@ class BackfillTable : public std::enable_shared_from_this<BackfillTable> {
Status AlterTableStateToAbort();
Status AlterTableStateToSuccess();

Status Abort();
void StartRequesterLivenessMonitor();
void StopLivenessMonitor();
Status CheckIfDone();
Status UpdateIndexPermissionsForIndexes();
Status ClearCheckpointStateInTablets();
Expand Down Expand Up @@ -230,8 +241,11 @@ class BackfillTable : public std::enable_shared_from_this<BackfillTable> {
const scoped_refptr<NamespaceInfo> ns_info_;
LeaderEpoch epoch_;
ash::WaitStateInfoPtr wait_state_;
std::optional<TransactionMetadata> requester_transaction_;
std::shared_ptr<DdlRequesterLivenessTask> liveness_task_ GUARDED_BY(mutex_);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe this can be a std::weak_ptr? The task runs on its own, do we need to own it? Right now, both the task and backfilltable are holding refs to each other, so we really need to be sure we release both correctly.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@iSignal I think that the reference cycle is already explicitly broken because every exit path (MarkIndexesAsFailed, CheckIfDone) calls StopLivenessMonitor(), which moves liveness_task_ out and clears BackfillTable's reference to the task. Another reference to the task is the TableInfo's task list, and the task will hold the last shared_ptr<BackfillTable> because of the captures in the callbacks; when the task finishes, those are released too. No leak. Also, I think that weak_ptr might actually be wrong because StopLivenessMonitor needs to call AbortAndReturnPrevState on the task, but that depends on the life cycle of the tasks in TableInfo. I feel like shared_ptr is safer, but I could be wrong.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes it is safe now but it is a bit worrying that every future path would need to reason about and remember to call Stop... during Job termination paths to break the loop.

If we write it as below to get a shared ptr out of the weak ptr, it would allow the task to exit by itself as well. But open to other suggestions as well

void BackfillTable::StopLivenessMonitor() {
  std::shared_ptr<DdlRequesterLivenessTask> task;
  {
    std::lock_guard l(mutex_);
    task = liveness_task_.lock();
    liveness_task_.reset();
  }
  if (task) {
    task->AbortAndReturnPrevState(STATUS(Aborted, "BackfillTable is done"));
  }
}

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My concern is about AbortAndReturnPrevState. I thought the call must be made? If that's not the case, I'd agree that weak_ptr would be a better choice.

};


class BackfillTableJob : public server::MonitoredTask {
public:
explicit BackfillTableJob(std::shared_ptr<BackfillTable> backfill_table)
Expand Down
30 changes: 30 additions & 0 deletions src/yb/master/catalog_entity_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -874,6 +874,30 @@ class TableInfo : public RefCountedThreadSafe<TableInfo>,
is_backfilling_ = false;
}

// Store/retrieve the DDL transaction from the PG backend that initiated the backfill.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After some back-and-forth with AI, got this (which I haven't fully verified in the interest of time):

Review of SetPendingBackfillRequesterTransaction and TakePendingBackfillRequesterTransaction

Note: Not persisted across master failover

Already tracked in #31472 and commented at https://github.com/yugabyte/yugabyte-db/pull/31378/changes#r3198264572. Not repeating here.

Note: Take returning nullopt is expected in multiple cases

StartBackfillingData always calls Take when !requester_transaction && current_version (line 348). Nullopt is the normal result for:

  • YCQL: Set is never called — YCQL doesn't go through CatalogManager::BackfillIndex (which is PGSQL-only). Nullopt is the expected baseline.
  • YSQL without a requester transaction: older PG clients that don't send requester_transaction, or decode failure (line 6594).
  • YSQL after master failover: in-memory state lost, already tracked in [YSQL] Create BackfillJobPB earlier in the backfill lifecycle #31472.

The only scenario where nullopt from Take would indicate a problem is a version mismatch — Set was called at V+1 but Take is called at some other version. This would mean an unexpected version bump occurred between the permission update and the backfill launch. This is unlikely today (YSQL does exactly one bump), but there's no way to distinguish this case from the legitimate nullopt cases at the Take call site.

Not flagging this as actionable, but noting that debugging a missing liveness monitor will require correlating logs from the Set call site (which currently has no log) with the Take call site. A VLOG at the Set call (line 523) recording the stored version would help.

Issue 1: Version encoding assumes exactly one version bump between Set and Take

Set stores at current_version + 1. Take is called with the table's version at the time backfill is launched. This works because YSQL does exactly one permission update (WRITE_AND_DELETE → DO_BACKFILL) which bumps the version by exactly 1.

If a future change introduces additional intermediate permission steps or version bumps between Set and Take, the versions would mismatch and Take would silently return nullopt, disabling liveness monitoring. The version matching is correct today but the coupling is implicit. A comment on SetPendingBackfillRequesterTransaction noting this single-bump assumption would help.

Issue 2: Transaction decode failure is only a WARNING

In CatalogManager::BackfillIndex (lines 6590-6596):

if (req->has_requester_transaction()) {
    auto result = TransactionMetadata::FromPB(req->requester_transaction());
    if (result.ok()) {
      requester_txn = std::move(*result);
    } else {
      LOG(WARNING) << "BackfillIndex: failed to decode requester transaction: " << result.status();
    }
}

If the PG backend sends a malformed transaction, the decode fails and is logged as a WARNING. The backfill proceeds without liveness monitoring. This is fine for robustness (don't block backfill for a monitoring feature), but the WARNING could be easy to miss. Consider LOG(DFATAL) in debug builds to catch protocol bugs early.

Issue 3: Method bodies in the header file

SetPendingBackfillRequesterTransaction and TakePendingBackfillRequesterTransaction are defined inline in catalog_entity_info.h. Most TableInfo methods with comparable complexity (SetIsBackfilling, SetCreateTableErrorStatus, etc.) are declared in the header but defined in catalog_entity_info.cc. ClearIsBackfilling is inline but is a trivial one-liner. These two methods have lock acquisition, conditional logic, and std::exchange — they should follow the prevailing pattern and move to the .cc file.

Summary of actionable items

  1. Comment the single-version-bump assumption — at the Set call site (line 523-524) or on the field declaration.
  2. Add a VLOG at the Set call site (line 523) recording the stored version, to aid debugging when the liveness monitor unexpectedly doesn't start.
  3. Consider LOG(DFATAL) for transaction decode failure — at line 6594-6596, a malformed requester_transaction from the PG client is only a WARNING. LOG(DFATAL) would catch protocol bugs in debug builds.
  4. Move Set/Take method bodies to catalog_entity_info.cc — they have non-trivial logic and don't match the header-inline pattern used by comparable TableInfo methods.

// Stored when CatalogManager::BackfillIndex moves the index from WRITE_AND_DELETE to
// DO_BACKFILL; retrieved when StartBackfillingData actually creates the BackfillTable.
// schema_version must be the table version produced by the permission update (current + 1).
void SetPendingBackfillRequesterTransaction(
Comment thread
egladysh marked this conversation as resolved.
std::optional<TransactionMetadata> txn, uint32_t schema_version) {
std::lock_guard l(lock_);
pending_backfill_requester_transaction_ = std::move(txn);
pending_backfill_requester_transaction_version_ = schema_version;
}

// Consumes and returns the stored requester transaction, but only when schema_version
// matches the value recorded by SetPendingBackfillRequesterTransaction. Any mismatch
// (or no stored transaction) yields nullopt, so a stale transaction from an earlier
// backfill attempt is never applied to a later one.
std::optional<TransactionMetadata> TakePendingBackfillRequesterTransaction(
    uint32_t schema_version) {
  std::lock_guard guard(lock_);
  const bool usable = pending_backfill_requester_transaction_.has_value() &&
                      pending_backfill_requester_transaction_version_ == schema_version;
  if (usable) {
    return std::exchange(pending_backfill_requester_transaction_, std::nullopt);
  }
  return std::nullopt;
}

// Returns true if an "Alter" operation is in-progress.
Result<bool> IsAlterInProgress(uint32_t version) const;

Expand Down Expand Up @@ -985,6 +1009,12 @@ class TableInfo : public RefCountedThreadSafe<TableInfo>,
// In memory state set during backfill to prevent multiple backfill jobs.
bool is_backfilling_ = false;

// DDL transaction from the PG backend that initiated the backfill, and the table schema version
// at which it was stored. Set when BackfillIndex updates permissions (WRITE_AND_DELETE ->
// DO_BACKFILL) and cleared when StartBackfillingData creates the BackfillTable.
std::optional<TransactionMetadata> pending_backfill_requester_transaction_ GUARDED_BY(lock_);
uint32_t pending_backfill_requester_transaction_version_ GUARDED_BY(lock_) = 0;

std::atomic<bool> is_system_{false};

const bool colocated_;
Expand Down
Loading
Loading