-
Notifications
You must be signed in to change notification settings - Fork 1.3k
[#31263] YSQL: Stop index backfill when the CREATE INDEX session is terminated #31378
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
4b38b00
f2675a3
028994f
ebaafb3
8b620dc
b8f2335
f5f5f28
b707b74
a489127
9140fce
69826e4
d2aa0c8
8d8680c
16646e9
7c3c66d
c5b9ba9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -119,6 +119,10 @@ DEFINE_test_flag(bool, skip_index_backfill, false, | |||||
| DEFINE_test_flag(bool, block_do_backfill, false, | ||||||
| "Block DoBackfill from proceeding."); | ||||||
|
|
||||||
| DEFINE_test_flag(bool, skip_ddl_requester_liveness_check, false, | ||||||
| "Skip starting the requester liveness task. Used in tests to simulate the pre-fix behavior " | ||||||
| "where master continues sending BackfillIndex RPCs after the backend is killed."); | ||||||
|
|
||||||
| DEFINE_test_flag(bool, simulate_empty_indexes_during_backfill, false, | ||||||
| "Simulates BackfillTable::indexes_to_build() to return an empty set."); | ||||||
|
|
||||||
|
|
@@ -324,7 +328,8 @@ Status MultiStageAlterTable::StartBackfillingData( | |||||
| CatalogManager* catalog_manager, | ||||||
| const scoped_refptr<TableInfo>& indexed_table, | ||||||
| const std::vector<IndexInfoPB>& idx_infos, | ||||||
| std::optional<uint32_t> current_version, const LeaderEpoch& epoch) { | ||||||
| std::optional<uint32_t> current_version, const LeaderEpoch& epoch, | ||||||
| std::optional<TransactionMetadata> requester_transaction) { | ||||||
| // We leave the table state as ALTERING so that a master failover can resume the backfill. | ||||||
| RETURN_NOT_OK(ClearFullyAppliedAndUpdateState( | ||||||
| catalog_manager, indexed_table, current_version, /* change_state to RUNNING */ false, epoch)); | ||||||
|
|
@@ -337,6 +342,14 @@ Status MultiStageAlterTable::StartBackfillingData( | |||||
| VLOG(0) << __func__ << " starting backfill on " << indexed_table->ToString() << " for " | ||||||
| << yb::ToString(idx_infos); | ||||||
|
|
||||||
| // Retrieve the requester transaction if it was stored during the permission-update phase. | ||||||
| // Pass current_version so TakePendingBackfillRequesterTransaction rejects stale | ||||||
| // transactions from earlier backfill attempts. | ||||||
| if (!requester_transaction && current_version) { | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think the code is wrong. It guards against the case where the master has no in-memory transaction. It's related to my previous comment about nullopt, if it holds.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry, I should have clarified that this suggestion was contingent on #31378 (comment) being true. I believe it is true, but it is not a blocker to me to have this dead code. |
||||||
| requester_transaction = | ||||||
| indexed_table->TakePendingBackfillRequesterTransaction(*current_version); | ||||||
| } | ||||||
|
|
||||||
| if (FLAGS_TEST_skip_index_backfill) { | ||||||
| TRACE("Skipping backfill of data on tservers"); | ||||||
| LOG(INFO) << "Skipping backfill of data on tservers"; | ||||||
|
|
@@ -345,7 +358,7 @@ Status MultiStageAlterTable::StartBackfillingData( | |||||
|
|
||||||
| auto backfill_table = std::make_shared<BackfillTable>( | ||||||
| catalog_manager->master_, catalog_manager->AsyncTaskPool(), indexed_table, idx_infos, | ||||||
| *ns_info, epoch); | ||||||
| *ns_info, epoch, std::move(requester_transaction)); | ||||||
| Status s = backfill_table->Launch(); | ||||||
| if (!s.ok()) { | ||||||
| indexed_table->ClearIsBackfilling(); | ||||||
|
|
@@ -387,7 +400,8 @@ IndexPermissions NextPermission(IndexPermissions perm) { | |||||
|
|
||||||
| Status MultiStageAlterTable::LaunchNextTableInfoVersionIfNecessary( | ||||||
| CatalogManager* catalog_manager, const scoped_refptr<TableInfo>& indexed_table, | ||||||
| uint32_t current_version, const LeaderEpoch& epoch, bool respect_backfill_deferrals, | ||||||
| uint32_t current_version, const LeaderEpoch& epoch, | ||||||
| std::optional<TransactionMetadata> requester_transaction, bool respect_backfill_deferrals, | ||||||
| bool update_ysql_to_backfill) { | ||||||
| DVLOG_WITH_FUNC(3) | ||||||
| << Format("$0, version: $1, respect_deferrals: $2, update_ysql_to_backfill: $3", | ||||||
|
|
@@ -502,6 +516,15 @@ Status MultiStageAlterTable::LaunchNextTableInfoVersionIfNecessary( | |||||
|
|
||||||
| if (permissions_updated.ok() && *permissions_updated) { | ||||||
| VLOG(1) << "Sending alter table request with updated permissions"; | ||||||
| // Store the requester transaction so StartBackfillingData can retrieve it when the | ||||||
| // permission change reaches DO_BACKFILL and the second call launches backfill. | ||||||
| // Store current_version+1 (the new version after this permission update) | ||||||
| // so TakePendingBackfillRequesterTransaction can verify the transaction | ||||||
| // belongs to this exact backfill attempt and not a stale one. | ||||||
| if (requester_transaction) { | ||||||
| indexed_table->SetPendingBackfillRequesterTransaction( | ||||||
| std::move(requester_transaction), current_version + 1); | ||||||
| } | ||||||
| RETURN_NOT_OK(catalog_manager->SendAlterTableRequest(indexed_table, epoch)); | ||||||
| return Status::OK(); | ||||||
| } | ||||||
|
|
@@ -530,7 +553,8 @@ Status MultiStageAlterTable::LaunchNextTableInfoVersionIfNecessary( | |||||
| } | ||||||
| WARN_NOT_OK( | ||||||
| StartBackfillingData( | ||||||
| catalog_manager, indexed_table.get(), indexes_to_backfill, current_version, epoch), | ||||||
| catalog_manager, indexed_table.get(), indexes_to_backfill, current_version, epoch, | ||||||
| std::move(requester_transaction)), | ||||||
|
egladysh marked this conversation as resolved.
|
||||||
| yb::Format("Could not launch backfill for $0", indexed_table->ToString())); | ||||||
| } | ||||||
|
|
||||||
|
|
@@ -627,7 +651,7 @@ std::string RetrieveIndexNames(CatalogManager* mgr, | |||||
| BackfillTable::BackfillTable( | ||||||
| Master* master, ThreadPool* callback_pool, const scoped_refptr<TableInfo>& indexed_table, | ||||||
| std::vector<IndexInfoPB> indexes, const scoped_refptr<NamespaceInfo>& ns_info, | ||||||
| LeaderEpoch epoch) | ||||||
| LeaderEpoch epoch, std::optional<TransactionMetadata> requester_transaction) | ||||||
|
egladysh marked this conversation as resolved.
|
||||||
| : master_(master), | ||||||
| callback_pool_(callback_pool), | ||||||
| indexed_table_(indexed_table), | ||||||
|
|
@@ -637,7 +661,8 @@ BackfillTable::BackfillTable( | |||||
| RetrieveIndexNames(master->catalog_manager_impl(), requested_index_ids_)), | ||||||
| ns_info_(ns_info), | ||||||
| epoch_(std::move(epoch)), | ||||||
| wait_state_(ash::WaitStateInfo::CreateIfAshIsEnabled<ash::WaitStateInfo>()) { | ||||||
| wait_state_(ash::WaitStateInfo::CreateIfAshIsEnabled<ash::WaitStateInfo>()), | ||||||
| requester_transaction_(std::move(requester_transaction)) { | ||||||
| if (wait_state_) { | ||||||
| if (const auto& current_state = ash::WaitStateInfo::CurrentWaitState()) { | ||||||
| wait_state_->UpdateMetadata(current_state->metadata()); | ||||||
|
|
@@ -951,6 +976,7 @@ Status BackfillTable::DoLaunchBackfill() { | |||||
| } | ||||||
|
|
||||||
| Status BackfillTable::DoBackfill() { | ||||||
| StartRequesterLivenessMonitor(); | ||||||
| while (FLAGS_TEST_block_do_backfill) { | ||||||
| constexpr auto kSpinWait = 100ms; | ||||||
| LOG(INFO) << Format("Blocking $0 for $1", __func__, kSpinWait); | ||||||
|
|
@@ -984,6 +1010,7 @@ Status BackfillTable::Done(const Status& s, const std::unordered_set<TableId>& f | |||||
| if (!done() && --tablets_pending_ == 0) { | ||||||
| LOG_WITH_PREFIX(INFO) << "Completed backfilling the index table."; | ||||||
| done_.store(true, std::memory_order_release); | ||||||
| StopLivenessMonitor(); | ||||||
| RETURN_NOT_OK_PREPEND( | ||||||
| MarkAllIndexesAsSuccess(), "Failed to mark indexes as successfully backfilled."); | ||||||
| RETURN_NOT_OK_PREPEND(UpdateIndexPermissionsForIndexes(), "Failed to complete backfill."); | ||||||
|
|
@@ -997,6 +1024,7 @@ Status BackfillTable::MarkIndexesAsFailed( | |||||
| const std::unordered_set<TableId>& failed_indexes, const string& message) { | ||||||
| if (indexes_to_build() == failed_indexes) { | ||||||
| done_.store(true, std::memory_order_release); | ||||||
| StopLivenessMonitor(); | ||||||
|
egladysh marked this conversation as resolved.
|
||||||
| backfill_job_->SetState(MonitoredTaskState::kFailed); | ||||||
| } | ||||||
| return MarkIndexesAsDesired(failed_indexes, BackfillJobPB::FAILED, message); | ||||||
|
|
@@ -1077,6 +1105,47 @@ Status BackfillTable::MarkIndexesAsDesired( | |||||
| return Status::OK(); | ||||||
| } | ||||||
|
|
||||||
| void BackfillTable::StartRequesterLivenessMonitor() { | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. After some back-and-forth with AI, got this (which I haven't fully verified in the interest of time): Review of
|
||||||
| if (!requester_transaction_) { | ||||||
| return; | ||||||
| } | ||||||
| if (PREDICT_FALSE(FLAGS_TEST_skip_ddl_requester_liveness_check)) { | ||||||
| LOG_WITH_PREFIX(INFO) << "Skipping requester liveness monitor (TEST flag set)"; | ||||||
| return; | ||||||
| } | ||||||
| VLOG_WITH_PREFIX(1) << "Starting requester liveness monitor for transaction " | ||||||
| << requester_transaction_->transaction_id; | ||||||
|
|
||||||
| auto self = shared_from_this(); | ||||||
| BackgroundDdlCallbacks callbacks{ | ||||||
| .done_ = [self] { return self->done(); }, | ||||||
| .abort_ = [self] { return self->Abort(); }, | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It doesn't seem like BackfillTable::Abort / BackfillTable::Done are ready to be called in a multi-threaded context once we add this callback.
It seems like so far they were able to use std atomics to avoid real locking, but now it would be better to use a proper lock to keep it simple. We can have an explicit internal enum state (e.g. waiting, aborting, aborted, success) and use that to decide what to do from the callbacks (we only want the txn callback to affect the waiting state and not the others). Any other approach is also OK, but the current path seems prone to problems.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @iSignal Ah... I assumed that those atomics and mutexes were there to make them thread-safe. I do see the gap now. We can fix it with an enum (the kind of state machine you suggested) or by just moving the done_ usage around like: Which one would you prefer?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done_ would be simpler but does not handle the race between Abort and the tablet Done failure path, right? Both may try to mark indexes as failed. I guess an atomic int enum CAS with more than true/false can help distinguish the different states.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, I'm not sure. It seems like indexes_to_build() takes care of it with LockForWrite. A failed-to-failed transition is harmless, and after the failure indexes_to_build() will return {}, if my understanding is correct? |
||||||
| }; | ||||||
| auto task = DdlRequesterLivenessTask::CreateAndStartTask( | ||||||
| *master_->catalog_manager_impl(), | ||||||
| indexed_table_, | ||||||
| *requester_transaction_, | ||||||
| std::move(callbacks), | ||||||
| master_->client_future(), | ||||||
| *master_->messenger(), | ||||||
| epoch_); | ||||||
|
|
||||||
| std::lock_guard l(mutex_); | ||||||
| DCHECK(!liveness_task_) << "Liveness task already exists"; | ||||||
| liveness_task_ = std::move(task); | ||||||
| } | ||||||
|
|
||||||
| void BackfillTable::StopLivenessMonitor() { | ||||||
| std::shared_ptr<DdlRequesterLivenessTask> task; | ||||||
| { | ||||||
| std::lock_guard l(mutex_); | ||||||
| task = std::move(liveness_task_); | ||||||
| } | ||||||
| if (task) { | ||||||
| task->AbortAndReturnPrevState(STATUS(Aborted, "BackfillTable is done")); | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| Status BackfillTable::Abort() { | ||||||
| LOG(WARNING) << "Backfill failed/aborted."; | ||||||
| RETURN_NOT_OK(MarkAllIndexesAsFailed()); | ||||||
|
|
@@ -1086,6 +1155,7 @@ Status BackfillTable::Abort() { | |||||
| Status BackfillTable::CheckIfDone() { | ||||||
| if (indexes_to_build().empty()) { | ||||||
| done_.store(true, std::memory_order_release); | ||||||
| StopLivenessMonitor(); | ||||||
| RETURN_NOT_OK_PREPEND( | ||||||
| UpdateIndexPermissionsForIndexes(), | ||||||
| "Could not update index permissions after backfill"); | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,13 +27,15 @@ | |
|
|
||
| #include "yb/ash/wait_state.h" | ||
| #include "yb/common/entity_ids.h" | ||
| #include "yb/common/transaction.h" | ||
| #include "yb/dockv/partition.h" | ||
|
|
||
| #include "yb/gutil/integral_types.h" | ||
| #include "yb/gutil/ref_counted.h" | ||
|
|
||
| #include "yb/master/async_rpc_tasks_base.h" | ||
| #include "yb/master/catalog_entity_info.h" | ||
| #include "yb/master/ysql_ddl_verification_task.h" | ||
|
|
||
| #include "yb/qlexpr/index.h" | ||
|
|
||
|
|
@@ -67,7 +69,9 @@ class MultiStageAlterTable { | |
| // INDEX_PERM_DELETE_ONLY -> INDEX_PERM_WRITE_AND_DELETE -> BACKFILL | ||
| static Status LaunchNextTableInfoVersionIfNecessary( | ||
| CatalogManager* mgr, const scoped_refptr<TableInfo>& Info, uint32_t current_version, | ||
| const LeaderEpoch& epoch, bool respect_backfill_deferrals = true, | ||
| const LeaderEpoch& epoch, | ||
| std::optional<TransactionMetadata> requester_transaction, | ||
| bool respect_backfill_deferrals = true, | ||
| bool update_ysql_to_backfill = false); | ||
|
|
||
| // Clears the fully_applied_* state for the given table and optionally sets it to RUNNING. | ||
|
|
@@ -94,10 +98,13 @@ class MultiStageAlterTable { | |
|
|
||
| private: | ||
| // Start Index Backfill process/step for the specified table/index. | ||
| // If requester_transaction is provided it will be used to monitor the liveness of the | ||
| // PG backend that initiated the backfill. | ||
| static Status StartBackfillingData( | ||
| CatalogManager* catalog_manager, const scoped_refptr<TableInfo>& indexed_table, | ||
| const std::vector<IndexInfoPB>& idx_infos, std::optional<uint32_t> expected_version, | ||
| const LeaderEpoch& epoch); | ||
| const LeaderEpoch& epoch, | ||
| std::optional<TransactionMetadata> requester_transaction); | ||
| }; | ||
|
|
||
| class BackfillTablet; | ||
|
|
@@ -112,7 +119,8 @@ class BackfillTable : public std::enable_shared_from_this<BackfillTable> { | |
| const scoped_refptr<TableInfo> &indexed_table, | ||
| std::vector<IndexInfoPB> indexes, | ||
| const scoped_refptr<NamespaceInfo> &ns_info, | ||
| LeaderEpoch epoch); | ||
| LeaderEpoch epoch, | ||
| std::optional<TransactionMetadata> requester_transaction); | ||
|
|
||
| Status Launch(); | ||
|
|
||
|
|
@@ -169,6 +177,8 @@ class BackfillTable : public std::enable_shared_from_this<BackfillTable> { | |
|
|
||
| static void UnsetIndexTableRetainsDeleteMarkers(PersistentTableInfo* index_table); | ||
|
|
||
| Status Abort(); | ||
|
|
||
| private: | ||
| void LaunchBackfillOrAbort(); | ||
| Status WaitForTabletSplitting(); | ||
|
|
@@ -188,7 +198,8 @@ class BackfillTable : public std::enable_shared_from_this<BackfillTable> { | |
| Status AlterTableStateToAbort(); | ||
| Status AlterTableStateToSuccess(); | ||
|
|
||
| Status Abort(); | ||
| void StartRequesterLivenessMonitor(); | ||
| void StopLivenessMonitor(); | ||
| Status CheckIfDone(); | ||
| Status UpdateIndexPermissionsForIndexes(); | ||
| Status ClearCheckpointStateInTablets(); | ||
|
|
@@ -230,8 +241,11 @@ class BackfillTable : public std::enable_shared_from_this<BackfillTable> { | |
| const scoped_refptr<NamespaceInfo> ns_info_; | ||
| LeaderEpoch epoch_; | ||
| ash::WaitStateInfoPtr wait_state_; | ||
| std::optional<TransactionMetadata> requester_transaction_; | ||
| std::shared_ptr<DdlRequesterLivenessTask> liveness_task_ GUARDED_BY(mutex_); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe this can be a std::weak_ptr? The task runs on its own; do we need to own it? Right now, both the task and BackfillTable are holding refs to each other, so we really need to be sure we release both correctly.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @iSignal I think that the reference cycle is already explicitly broken because every exit path (MarkIndexesAsFailed, CheckIfDone) calls StopLivenessMonitor(), which moves liveness_task_ out and clears BackfillTable's reference to the task. Another reference to the task is the TableInfo's task list and the task will hold the last
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes it is safe now but it is a bit worrying that every future path would need to reason about and remember to call Stop... during Job termination paths to break the loop. If we write it as below to get a shared ptr out of the weak ptr, it would allow the task to exit by itself as well. But open to other suggestions as well
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My concern is about |
||
| }; | ||
|
|
||
|
|
||
| class BackfillTableJob : public server::MonitoredTask { | ||
| public: | ||
| explicit BackfillTableJob(std::shared_ptr<BackfillTable> backfill_table) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -874,6 +874,30 @@ class TableInfo : public RefCountedThreadSafe<TableInfo>, | |
| is_backfilling_ = false; | ||
| } | ||
|
|
||
| // Store/retrieve the DDL transaction from the PG backend that initiated the backfill. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. After some back-and-forth with AI, got this (which I haven't fully verified in the interest of time): Review of
|
||
| // Stored when CatalogManager::BackfillIndex moves the index from WRITE_AND_DELETE to | ||
| // DO_BACKFILL; retrieved when StartBackfillingData actually creates the BackfillTable. | ||
| // schema_version must be the table version produced by the permission update (current + 1). | ||
| void SetPendingBackfillRequesterTransaction( | ||
|
egladysh marked this conversation as resolved.
|
||
| std::optional<TransactionMetadata> txn, uint32_t schema_version) { | ||
| std::lock_guard l(lock_); | ||
| pending_backfill_requester_transaction_ = std::move(txn); | ||
| pending_backfill_requester_transaction_version_ = schema_version; | ||
| } | ||
|
|
||
| // Returns the stored transaction and clears it, but only if schema_version matches the value | ||
| // passed to SetPendingBackfillRequesterTransaction. Returns nullopt otherwise so that a stale | ||
| // transaction from an earlier backfill attempt is never used for a later one. | ||
| std::optional<TransactionMetadata> TakePendingBackfillRequesterTransaction( | ||
| uint32_t schema_version) { | ||
| std::lock_guard l(lock_); | ||
| if (!pending_backfill_requester_transaction_ || | ||
| pending_backfill_requester_transaction_version_ != schema_version) { | ||
| return std::nullopt; | ||
| } | ||
| return std::exchange(pending_backfill_requester_transaction_, std::nullopt); | ||
| } | ||
|
|
||
| // Returns true if an "Alter" operation is in-progress. | ||
| Result<bool> IsAlterInProgress(uint32_t version) const; | ||
|
|
||
|
|
@@ -985,6 +1009,12 @@ class TableInfo : public RefCountedThreadSafe<TableInfo>, | |
| // In memory state set during backfill to prevent multiple backfill jobs. | ||
| bool is_backfilling_ = false; | ||
|
|
||
| // DDL transaction from the PG backend that initiated the backfill, and the table schema version | ||
| // at which it was stored. Set when BackfillIndex updates permissions (WRITE_AND_DELETE -> | ||
| // DO_BACKFILL) and cleared when StartBackfillingData creates the BackfillTable. | ||
| std::optional<TransactionMetadata> pending_backfill_requester_transaction_ GUARDED_BY(lock_); | ||
| uint32_t pending_backfill_requester_transaction_version_ GUARDED_BY(lock_) = 0; | ||
|
|
||
| std::atomic<bool> is_system_{false}; | ||
|
|
||
| const bool colocated_; | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.