Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/yb/client/client-internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -990,6 +990,7 @@ Status YBClient::Data::BackfillIndex(YBClient* client,
const YBTableName& index_name,
const TableId& index_id,
CoarseTimePoint deadline,
std::optional<TransactionMetadata> requester_transaction,
bool wait) {
BackfillIndexRequestPB req;
BackfillIndexResponsePB resp;
Expand All @@ -1000,6 +1001,9 @@ Status YBClient::Data::BackfillIndex(YBClient* client,
if (!index_id.empty()) {
req.mutable_index_identifier()->set_table_id(index_id);
}
if (requester_transaction.has_value()) {
requester_transaction->ToPB(req.mutable_requester_transaction());
}

RETURN_NOT_OK((SyncLeaderMasterRpc(
deadline, req, &resp, "BackfillIndex", &master::MasterDdlProxy::BackfillIndexAsync)));
Expand Down
1 change: 1 addition & 0 deletions src/yb/client/client-internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ class YBClient::Data {
const YBTableName& table_name,
const TableId& table_id,
CoarseTimePoint deadline,
std::optional<TransactionMetadata> requester_transaction,
bool wait = true);
Status IsBackfillIndexInProgress(YBClient* client,
const TableId& table_id,
Expand Down
7 changes: 5 additions & 2 deletions src/yb/client/client.cc
Original file line number Diff line number Diff line change
Expand Up @@ -622,11 +622,14 @@ Status YBClient::TruncateTables(const TableIds& table_ids, bool wait) {
return data_->TruncateTables(this, table_ids, deadline, wait);
}

Status YBClient::BackfillIndex(const TableId& table_id, bool wait, CoarseTimePoint deadline) {
Status YBClient::BackfillIndex(const TableId& table_id,
std::optional<TransactionMetadata> requester_transaction,
bool wait, CoarseTimePoint deadline) {
if (deadline == CoarseTimePoint()) {
deadline = CoarseMonoClock::Now() + FLAGS_backfill_index_client_rpc_timeout_ms * 1ms;
}
return data_->BackfillIndex(this, YBTableName(), table_id, deadline, wait);
return data_->BackfillIndex(
this, YBTableName(), table_id, deadline, std::move(requester_transaction), wait);
}

Status YBClient::GetIndexBackfillProgress(
Expand Down
5 changes: 4 additions & 1 deletion src/yb/client/client.h
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,10 @@ class YBClient {

// Backfill the specified index table. This is only supported for YSQL at the moment.
Status BackfillIndex(
const TableId& table_id, bool wait = true, CoarseTimePoint deadline = CoarseTimePoint());
const TableId& table_id,
std::optional<TransactionMetadata> requester_transaction,
bool wait = true,
CoarseTimePoint deadline = CoarseTimePoint());

Status GetIndexBackfillProgress(
const TableIds& index_ids,
Expand Down
4 changes: 3 additions & 1 deletion src/yb/client/client_master_rpc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,9 @@ void ClientMasterRpcBase::Finished(const Status& status) {
}

if (new_status.IsNetworkError() || new_status.IsRemoteError()) {
if (rpc::RpcError(new_status) != rpc::ErrorStatusPB::ERROR_NO_SUCH_METHOD) {
const auto rpc_error = rpc::RpcError(new_status);
if (rpc_error != rpc::ErrorStatusPB::ERROR_NO_SUCH_METHOD &&
rpc_error != rpc::ErrorStatusPB::FATAL_SERVER_SHUTTING_DOWN) {
Comment thread
egladysh marked this conversation as resolved.
LOG(WARNING) << ToString() << ": Encountered a network error from the Master("
<< client_data_->leader_master_hostport().ToString()
<< "): " << new_status.ToString() << ", retrying...";
Expand Down
82 changes: 76 additions & 6 deletions src/yb/master/backfill_index.cc
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,10 @@ DEFINE_test_flag(bool, skip_index_backfill, false,
DEFINE_test_flag(bool, block_do_backfill, false,
"Block DoBackfill from proceeding.");

DEFINE_test_flag(bool, skip_ddl_requester_liveness_check, false,
"Skip starting the requester liveness task. Used in tests to simulate the pre-fix behavior "
"where master continues sending BackfillIndex RPCs after the backend is killed.");

DEFINE_test_flag(bool, simulate_empty_indexes_during_backfill, false,
"Simulates BackfillTable::indexes_to_build() to return an empty set.");

Expand Down Expand Up @@ -324,7 +328,8 @@ Status MultiStageAlterTable::StartBackfillingData(
CatalogManager* catalog_manager,
const scoped_refptr<TableInfo>& indexed_table,
const std::vector<IndexInfoPB>& idx_infos,
std::optional<uint32_t> current_version, const LeaderEpoch& epoch) {
std::optional<uint32_t> current_version, const LeaderEpoch& epoch,
std::optional<TransactionMetadata> requester_transaction) {
// We leave the table state as ALTERING so that a master failover can resume the backfill.
RETURN_NOT_OK(ClearFullyAppliedAndUpdateState(
catalog_manager, indexed_table, current_version, /* change_state to RUNNING */ false, epoch));
Expand All @@ -337,6 +342,14 @@ Status MultiStageAlterTable::StartBackfillingData(
VLOG(0) << __func__ << " starting backfill on " << indexed_table->ToString() << " for "
<< yb::ToString(idx_infos);

// Retrieve the requester transaction if it was stored during the permission-update phase.
// Pass current_version so TakePendingBackfillRequesterTransaction rejects stale
// transactions from earlier backfill attempts.
if (!requester_transaction && current_version) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (!requester_transaction && current_version) {
if (current_version) {

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think the code is wrong. It guards against the case when the master has no in memory transaction. It's related to my previous comment about nullopt if it holds.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I should have clarified that this suggestion was contingent on #31378 (comment) being true. I believe it is true, but it is not a blocker to me to have this dead code.

requester_transaction =
indexed_table->TakePendingBackfillRequesterTransaction(*current_version);
}

if (FLAGS_TEST_skip_index_backfill) {
TRACE("Skipping backfill of data on tservers");
LOG(INFO) << "Skipping backfill of data on tservers";
Expand All @@ -345,7 +358,7 @@ Status MultiStageAlterTable::StartBackfillingData(

auto backfill_table = std::make_shared<BackfillTable>(
catalog_manager->master_, catalog_manager->AsyncTaskPool(), indexed_table, idx_infos,
*ns_info, epoch);
*ns_info, epoch, std::move(requester_transaction));
Status s = backfill_table->Launch();
if (!s.ok()) {
indexed_table->ClearIsBackfilling();
Expand Down Expand Up @@ -387,7 +400,8 @@ IndexPermissions NextPermission(IndexPermissions perm) {

Status MultiStageAlterTable::LaunchNextTableInfoVersionIfNecessary(
CatalogManager* catalog_manager, const scoped_refptr<TableInfo>& indexed_table,
uint32_t current_version, const LeaderEpoch& epoch, bool respect_backfill_deferrals,
uint32_t current_version, const LeaderEpoch& epoch,
std::optional<TransactionMetadata> requester_transaction, bool respect_backfill_deferrals,
bool update_ysql_to_backfill) {
DVLOG_WITH_FUNC(3)
<< Format("$0, version: $1, respect_deferrals: $2, update_ysql_to_backfill: $3",
Expand Down Expand Up @@ -502,6 +516,15 @@ Status MultiStageAlterTable::LaunchNextTableInfoVersionIfNecessary(

if (permissions_updated.ok() && *permissions_updated) {
VLOG(1) << "Sending alter table request with updated permissions";
// Store the requester transaction so StartBackfillingData can retrieve it when the
// permission change reaches DO_BACKFILL and the second call launches backfill.
// Store current_version+1 (the new version after this permission update)
// so TakePendingBackfillRequesterTransaction can verify the transaction
// belongs to this exact backfill attempt and not a stale one.
if (requester_transaction) {
indexed_table->SetPendingBackfillRequesterTransaction(
std::move(requester_transaction), current_version + 1);
}
RETURN_NOT_OK(catalog_manager->SendAlterTableRequest(indexed_table, epoch));
return Status::OK();
}
Expand Down Expand Up @@ -530,7 +553,8 @@ Status MultiStageAlterTable::LaunchNextTableInfoVersionIfNecessary(
}
WARN_NOT_OK(
StartBackfillingData(
catalog_manager, indexed_table.get(), indexes_to_backfill, current_version, epoch),
catalog_manager, indexed_table.get(), indexes_to_backfill, current_version, epoch,
std::move(requester_transaction)),
Comment thread
egladysh marked this conversation as resolved.
yb::Format("Could not launch backfill for $0", indexed_table->ToString()));
}

Expand Down Expand Up @@ -627,7 +651,7 @@ std::string RetrieveIndexNames(CatalogManager* mgr,
BackfillTable::BackfillTable(
Master* master, ThreadPool* callback_pool, const scoped_refptr<TableInfo>& indexed_table,
std::vector<IndexInfoPB> indexes, const scoped_refptr<NamespaceInfo>& ns_info,
LeaderEpoch epoch)
LeaderEpoch epoch, std::optional<TransactionMetadata> requester_transaction)
Comment thread
egladysh marked this conversation as resolved.
: master_(master),
callback_pool_(callback_pool),
indexed_table_(indexed_table),
Expand All @@ -637,7 +661,8 @@ BackfillTable::BackfillTable(
RetrieveIndexNames(master->catalog_manager_impl(), requested_index_ids_)),
ns_info_(ns_info),
epoch_(std::move(epoch)),
wait_state_(ash::WaitStateInfo::CreateIfAshIsEnabled<ash::WaitStateInfo>()) {
wait_state_(ash::WaitStateInfo::CreateIfAshIsEnabled<ash::WaitStateInfo>()),
requester_transaction_(std::move(requester_transaction)) {
if (wait_state_) {
if (const auto& current_state = ash::WaitStateInfo::CurrentWaitState()) {
wait_state_->UpdateMetadata(current_state->metadata());
Expand Down Expand Up @@ -951,6 +976,7 @@ Status BackfillTable::DoLaunchBackfill() {
}

Status BackfillTable::DoBackfill() {
StartRequesterLivenessMonitor();
while (FLAGS_TEST_block_do_backfill) {
constexpr auto kSpinWait = 100ms;
LOG(INFO) << Format("Blocking $0 for $1", __func__, kSpinWait);
Expand Down Expand Up @@ -984,6 +1010,7 @@ Status BackfillTable::Done(const Status& s, const std::unordered_set<TableId>& f
if (!done() && --tablets_pending_ == 0) {
LOG_WITH_PREFIX(INFO) << "Completed backfilling the index table.";
done_.store(true, std::memory_order_release);
StopLivenessMonitor();
RETURN_NOT_OK_PREPEND(
MarkAllIndexesAsSuccess(), "Failed to mark indexes as successfully backfilled.");
RETURN_NOT_OK_PREPEND(UpdateIndexPermissionsForIndexes(), "Failed to complete backfill.");
Expand All @@ -997,6 +1024,7 @@ Status BackfillTable::MarkIndexesAsFailed(
const std::unordered_set<TableId>& failed_indexes, const string& message) {
if (indexes_to_build() == failed_indexes) {
done_.store(true, std::memory_order_release);
StopLivenessMonitor();
Comment thread
egladysh marked this conversation as resolved.
backfill_job_->SetState(MonitoredTaskState::kFailed);
}
return MarkIndexesAsDesired(failed_indexes, BackfillJobPB::FAILED, message);
Expand Down Expand Up @@ -1077,6 +1105,47 @@ Status BackfillTable::MarkIndexesAsDesired(
return Status::OK();
}

void BackfillTable::StartRequesterLivenessMonitor() {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After some back-and-forth with AI, got this (which I haven't fully verified in the interest of time):

Review of StartRequesterLivenessMonitor and StopLivenessMonitor

StartRequesterLivenessMonitor

Issue 1: Race between CreateAndStartTask and storing liveness_task_

The task is created and started at line 1120, but not stored into liveness_task_ until line 1131. The task begins polling immediately. If the transaction is already aborted (or aborts very quickly), the task's FinishPollTransaction fires abort_()BackfillTable::Abort()MarkAllIndexesAsFailed()CheckIfDone()StopLivenessMonitor() before liveness_task_ is assigned. StopLivenessMonitor sees null and does nothing.

In this specific case the task happens to have self-completed via Complete() before calling abort_(), so there's no leak. But the correctness depends on an implementation detail of FinishPollTransaction's ordering (Complete() before abort_()) — if that ordering ever changes, this breaks silently.

The fix is straightforward: create the task without starting it, store it under the lock, then start it:

auto task = std::make_shared<DdlRequesterLivenessTask>(...);
{
  std::lock_guard l(mutex_);
  DCHECK(!liveness_task_);
  liveness_task_ = task;
}
task->Start();

This would require exposing a two-phase create+start API on DdlRequesterLivenessTask (the current CreateAndStartTask bundles both).

Issue 2: No error handling on CreateAndStartTask

CreateAndStartTask returns a shared_ptr, not a Result. If Start() fails internally (threadpool full, task immediately aborted by ValidateRunnable, etc.), the caller has no way to know the liveness monitor is non-functional. The entire feature becomes silently disabled with no log message indicating why.

At minimum, CreateAndStartTask should return Result<shared_ptr<DdlRequesterLivenessTask>> or the caller should verify the task's state after creation. Alternatively, log a warning if the task is in a terminal state immediately after start.

Issue 3: Callback captures shared_ptr<BackfillTable>

The lambdas capture self via shared_from_this(). This prevents BackfillTable destruction while the liveness task is alive. If StopLivenessMonitor is never called (e.g. a code path that sets done_ without going through the normal terminal paths), the BackfillTable leaks along with the task. Today's call sites appear to cover all terminal paths, but this is fragile — a future change that adds a new exit path could miss the StopLivenessMonitor call.

Consider weak_ptr for the callbacks, or a mechanism where the task self-terminates when done_() returns true (which ValidateRunnable already does — but only on the next scheduled step, not immediately).

StopLivenessMonitor

Good: Lock discipline avoids deadlock

Moving the task out via std::move under mutex_ then calling AbortAndReturnPrevState outside the lock is correct. The abort path calls PerformAbort()Shutdown()sync_.Wait(). If this were done under mutex_, and the task's in-flight callback tried to re-enter BackfillTable (which acquires mutex_), you'd deadlock.

Good: Idempotency via std::move

After std::move, liveness_task_ is null; subsequent calls are no-ops. This is essential because StopLivenessMonitor is called from multiple convergent terminal paths: Done() (success), MarkIndexesAsFailed() (failure), CheckIfDone() (via Abort()).

Concern: AbortAndReturnPrevState may block

AbortAndReturnPrevState can trigger PerformAbort()Shutdown()sync_.Wait(), which blocks until the in-flight GetTransactionStatus RPC completes. This is called from Done() and CheckIfDone(), which run on the callback threadpool. A slow or hung transaction status RPC could stall the backfill completion path. The transaction_rpc_timeout_ms flag bounds this, but it's worth being aware of.

Summary of actionable items

  1. Fix the create-then-store race — separate creation from starting, or at minimum document why the current ordering is safe and what invariants it depends on.
  2. Add error handling for task creation — at minimum log when the liveness monitor fails to start.
  3. Consider the blocking potential of StopLivenessMonitorsync_.Wait() can stall the callback threadpool.

if (!requester_transaction_) {
return;
}
if (PREDICT_FALSE(FLAGS_TEST_skip_ddl_requester_liveness_check)) {
LOG_WITH_PREFIX(INFO) << "Skipping requester liveness monitor (TEST flag set)";
return;
}
VLOG_WITH_PREFIX(1) << "Starting requester liveness monitor for transaction "
<< requester_transaction_->transaction_id;

auto self = shared_from_this();
BackgroundDdlCallbacks callbacks{
.done_ = [self] { return self->done(); },
.abort_ = [self] { return self->Abort(); },
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't seem like BackfillTable::Abort / BackfillTable::Done are ready to be called in a multi threaded context once we add this callback.

  1. txn poll can call Abort while backfilltablet is causing a transition to Done success path.
  2. txn poll can call Abort while backfilltablet is causing its own failure transition.

It seems like so far they were able to use std atomics to avoid real locking but now it would be better to use a proper lock to keep it simple. we can have some explicit internal enum state like waiting, aborting, aborted, success and use that to decide what to do from the callbacks (we only want to affect waiting state from the txn callback and not the others). Any other approaches are also ok but current path seems prone to problems.

Copy link
Copy Markdown
Collaborator Author

@egladysh egladysh May 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@iSignal Ah... I assumed that those atomics and mutex's were there to make them thread-safe. I do see the gap now. We can fix it with an enum (kind of a state machine you suggested) or just moving the done_ usage around like:

Status BackfillTable::Abort() {
      bool expected = false;
      if (!done_.compare_exchange_strong(expected, true)) {
          return Status::OK();
      }
      ...

Which one would you prefer?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done_ would be simpler but does not handle race between Abort and tablet Done failure path right? Both may try to mark indexes as failed. I guess an atomic int enum CAS with more than true/false can help distinguish the different states.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, I amn't sure. It seems like indexes_to_build() takes care of it with LockForWrite. Transition failed to failed is harmless and after failed indexes_to_build() will return {} if my understanding is correct?

};
auto task = DdlRequesterLivenessTask::CreateAndStartTask(
*master_->catalog_manager_impl(),
indexed_table_,
*requester_transaction_,
std::move(callbacks),
master_->client_future(),
*master_->messenger(),
epoch_);

std::lock_guard l(mutex_);
DCHECK(!liveness_task_) << "Liveness task already exists";
liveness_task_ = std::move(task);
}

void BackfillTable::StopLivenessMonitor() {
std::shared_ptr<DdlRequesterLivenessTask> task;
{
std::lock_guard l(mutex_);
task = std::move(liveness_task_);
}
if (task) {
task->AbortAndReturnPrevState(STATUS(Aborted, "BackfillTable is done"));
}
}

Status BackfillTable::Abort() {
LOG(WARNING) << "Backfill failed/aborted.";
RETURN_NOT_OK(MarkAllIndexesAsFailed());
Expand All @@ -1086,6 +1155,7 @@ Status BackfillTable::Abort() {
Status BackfillTable::CheckIfDone() {
if (indexes_to_build().empty()) {
done_.store(true, std::memory_order_release);
StopLivenessMonitor();
RETURN_NOT_OK_PREPEND(
UpdateIndexPermissionsForIndexes(),
"Could not update index permissions after backfill");
Expand Down
22 changes: 18 additions & 4 deletions src/yb/master/backfill_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,15 @@

#include "yb/ash/wait_state.h"
#include "yb/common/entity_ids.h"
#include "yb/common/transaction.h"
#include "yb/dockv/partition.h"

#include "yb/gutil/integral_types.h"
#include "yb/gutil/ref_counted.h"

#include "yb/master/async_rpc_tasks_base.h"
#include "yb/master/catalog_entity_info.h"
#include "yb/master/ysql_ddl_verification_task.h"

#include "yb/qlexpr/index.h"

Expand Down Expand Up @@ -67,7 +69,9 @@ class MultiStageAlterTable {
// INDEX_PERM_DELETE_ONLY -> INDEX_PERM_WRITE_AND_DELETE -> BACKFILL
static Status LaunchNextTableInfoVersionIfNecessary(
CatalogManager* mgr, const scoped_refptr<TableInfo>& Info, uint32_t current_version,
const LeaderEpoch& epoch, bool respect_backfill_deferrals = true,
const LeaderEpoch& epoch,
std::optional<TransactionMetadata> requester_transaction,
bool respect_backfill_deferrals = true,
bool update_ysql_to_backfill = false);

// Clears the fully_applied_* state for the given table and optionally sets it to RUNNING.
Expand All @@ -94,10 +98,13 @@ class MultiStageAlterTable {

private:
// Start Index Backfill process/step for the specified table/index.
// If requester_transaction is provided it will be used to monitor the liveness of the
// PG backend that initiated the backfill.
static Status StartBackfillingData(
CatalogManager* catalog_manager, const scoped_refptr<TableInfo>& indexed_table,
const std::vector<IndexInfoPB>& idx_infos, std::optional<uint32_t> expected_version,
const LeaderEpoch& epoch);
const LeaderEpoch& epoch,
std::optional<TransactionMetadata> requester_transaction);
};

class BackfillTablet;
Expand All @@ -112,7 +119,8 @@ class BackfillTable : public std::enable_shared_from_this<BackfillTable> {
const scoped_refptr<TableInfo> &indexed_table,
std::vector<IndexInfoPB> indexes,
const scoped_refptr<NamespaceInfo> &ns_info,
LeaderEpoch epoch);
LeaderEpoch epoch,
std::optional<TransactionMetadata> requester_transaction);

Status Launch();

Expand Down Expand Up @@ -169,6 +177,8 @@ class BackfillTable : public std::enable_shared_from_this<BackfillTable> {

static void UnsetIndexTableRetainsDeleteMarkers(PersistentTableInfo* index_table);

Status Abort();

private:
void LaunchBackfillOrAbort();
Status WaitForTabletSplitting();
Expand All @@ -188,7 +198,8 @@ class BackfillTable : public std::enable_shared_from_this<BackfillTable> {
Status AlterTableStateToAbort();
Status AlterTableStateToSuccess();

Status Abort();
void StartRequesterLivenessMonitor();
void StopLivenessMonitor();
Status CheckIfDone();
Status UpdateIndexPermissionsForIndexes();
Status ClearCheckpointStateInTablets();
Expand Down Expand Up @@ -230,8 +241,11 @@ class BackfillTable : public std::enable_shared_from_this<BackfillTable> {
const scoped_refptr<NamespaceInfo> ns_info_;
LeaderEpoch epoch_;
ash::WaitStateInfoPtr wait_state_;
std::optional<TransactionMetadata> requester_transaction_;
std::shared_ptr<DdlRequesterLivenessTask> liveness_task_ GUARDED_BY(mutex_);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe this can be a std::weak_ptr? The task runs on its own, do we need to own it? Right now, both the task and backfilltable are holding refs to each other, so we really need to be sure we release both correctly.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@iSignal I think that the reference cycle is already explicitly broken because every exit path (MarkIndexesAsFailed, CheckIfDone) calls StopLivenessMonitor(), which moves liveness_task_ out and clears BackfillTable's reference to the task. Another reference to the task is the TableInfo's task list, and the task will hold the last shared_ptr<BackfillTable> because of the captures in the callbacks; when the task finishes, those are released too. No leak. Also, I think that weak_ptr might actually be wrong because StopLivenessMonitor needs to call AbortAndReturnPrevState on the task, but that depends on the life cycle of the tasks in TableInfo. I feel like shared_ptr is safer, but I could be wrong.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes it is safe now but it is a bit worrying that every future path would need to reason about and remember to call Stop... during Job termination paths to break the loop.

If we write it as below to get a shared ptr out of the weak ptr, it would allow the task to exit by itself as well. But open to other suggestions as well

void BackfillTable::StopLivenessMonitor() {
  std::shared_ptr<DdlRequesterLivenessTask> task;
  {
    std::lock_guard l(mutex_);
    task = liveness_task_.lock();
    liveness_task_.reset();
  }
  if (task) {
    task->AbortAndReturnPrevState(STATUS(Aborted, "BackfillTable is done"));
  }
}

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My concern is about AbortAndReturnPrevState. I thought the call must be made? If that's not the case, I'd agree that weak_ptr would be a better choice.

};


class BackfillTableJob : public server::MonitoredTask {
public:
explicit BackfillTableJob(std::shared_ptr<BackfillTable> backfill_table)
Expand Down
30 changes: 30 additions & 0 deletions src/yb/master/catalog_entity_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -874,6 +874,30 @@ class TableInfo : public RefCountedThreadSafe<TableInfo>,
is_backfilling_ = false;
}

// Store/retrieve the DDL transaction from the PG backend that initiated the backfill.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After some back-and-forth with AI, got this (which I haven't fully verified in the interest of time):

Review of SetPendingBackfillRequesterTransaction and TakePendingBackfillRequesterTransaction

Note: Not persisted across master failover

Already tracked in #31472 and commented at https://github.com/yugabyte/yugabyte-db/pull/31378/changes#r3198264572. Not repeating here.

Note: Take returning nullopt is expected in multiple cases

StartBackfillingData always calls Take when !requester_transaction && current_version (line 348). Nullopt is the normal result for:

  • YCQL: Set is never called — YCQL doesn't go through CatalogManager::BackfillIndex (which is PGSQL-only). Nullopt is the expected baseline.
  • YSQL without a requester transaction: older PG clients that don't send requester_transaction, or decode failure (line 6594).
  • YSQL after master failover: in-memory state lost, already tracked in [YSQL] Create BackfillJobPB earlier in the backfill lifecycle #31472.

The only scenario where nullopt from Take would indicate a problem is a version mismatch — Set was called at V+1 but Take is called at some other version. This would mean an unexpected version bump occurred between the permission update and the backfill launch. This is unlikely today (YSQL does exactly one bump), but there's no way to distinguish this case from the legitimate nullopt cases at the Take call site.

Not flagging this as actionable, but noting that debugging a missing liveness monitor will require correlating logs from the Set call site (which currently has no log) with the Take call site. A VLOG at the Set call (line 523) recording the stored version would help.

Issue 1: Version encoding assumes exactly one version bump between Set and Take

Set stores at current_version + 1. Take is called with the table's version at the time backfill is launched. This works because YSQL does exactly one permission update (WRITE_AND_DELETE → DO_BACKFILL) which bumps the version by exactly 1.

If a future change introduces additional intermediate permission steps or version bumps between Set and Take, the versions would mismatch and Take would silently return nullopt, disabling liveness monitoring. The version matching is correct today but the coupling is implicit. A comment on SetPendingBackfillRequesterTransaction noting this single-bump assumption would help.

Issue 2: Transaction decode failure is only a WARNING

In CatalogManager::BackfillIndex (lines 6590-6596):

if (req->has_requester_transaction()) {
    auto result = TransactionMetadata::FromPB(req->requester_transaction());
    if (result.ok()) {
      requester_txn = std::move(*result);
    } else {
      LOG(WARNING) << "BackfillIndex: failed to decode requester transaction: " << result.status();
    }
}

If the PG backend sends a malformed transaction, the decode fails and is logged as a WARNING. The backfill proceeds without liveness monitoring. This is fine for robustness (don't block backfill for a monitoring feature), but the WARNING could be easy to miss. Consider LOG(DFATAL) in debug builds to catch protocol bugs early.

Issue 3: Method bodies in the header file

SetPendingBackfillRequesterTransaction and TakePendingBackfillRequesterTransaction are defined inline in catalog_entity_info.h. Most TableInfo methods with comparable complexity (SetIsBackfilling, SetCreateTableErrorStatus, etc.) are declared in the header but defined in catalog_entity_info.cc. ClearIsBackfilling is inline but is a trivial one-liner. These two methods have lock acquisition, conditional logic, and std::exchange — they should follow the prevailing pattern and move to the .cc file.

Summary of actionable items

  1. Comment the single-version-bump assumption — at the Set call site (line 523-524) or on the field declaration.
  2. Add a VLOG at the Set call site (line 523) recording the stored version, to aid debugging when the liveness monitor unexpectedly doesn't start.
  3. Consider LOG(DFATAL) for transaction decode failure — at line 6594-6596, a malformed requester_transaction from the PG client is only a WARNING. LOG(DFATAL) would catch protocol bugs in debug builds.
  4. Move Set/Take method bodies to catalog_entity_info.cc — they have non-trivial logic and don't match the header-inline pattern used by comparable TableInfo methods.

// Stored when CatalogManager::BackfillIndex moves the index from WRITE_AND_DELETE to
// DO_BACKFILL; retrieved when StartBackfillingData actually creates the BackfillTable.
// schema_version must be the table version produced by the permission update (current + 1).
void SetPendingBackfillRequesterTransaction(
Comment thread
egladysh marked this conversation as resolved.
std::optional<TransactionMetadata> txn, uint32_t schema_version) {
std::lock_guard l(lock_);
pending_backfill_requester_transaction_ = std::move(txn);
pending_backfill_requester_transaction_version_ = schema_version;
}

// Consumes and returns the stored requester transaction, but only when schema_version
// matches the value recorded by SetPendingBackfillRequesterTransaction. Any mismatch
// (or no stored transaction) yields nullopt, so a stale transaction from an earlier
// backfill attempt is never applied to a later one.
std::optional<TransactionMetadata> TakePendingBackfillRequesterTransaction(
    uint32_t schema_version) {
  std::lock_guard guard(lock_);
  const bool usable = pending_backfill_requester_transaction_.has_value() &&
                      pending_backfill_requester_transaction_version_ == schema_version;
  if (usable) {
    return std::exchange(pending_backfill_requester_transaction_, std::nullopt);
  }
  return std::nullopt;
}

// Returns true if an "Alter" operation is in-progress.
Result<bool> IsAlterInProgress(uint32_t version) const;

Expand Down Expand Up @@ -985,6 +1009,12 @@ class TableInfo : public RefCountedThreadSafe<TableInfo>,
// In memory state set during backfill to prevent multiple backfill jobs.
bool is_backfilling_ = false;

// DDL transaction from the PG backend that initiated the backfill, and the table schema version
// at which it was stored. Set when BackfillIndex updates permissions (WRITE_AND_DELETE ->
// DO_BACKFILL) and cleared when StartBackfillingData creates the BackfillTable.
std::optional<TransactionMetadata> pending_backfill_requester_transaction_ GUARDED_BY(lock_);
uint32_t pending_backfill_requester_transaction_version_ GUARDED_BY(lock_) = 0;

std::atomic<bool> is_system_{false};

const bool colocated_;
Expand Down
Loading
Loading